1. Introduction

This report analyzes a heart disease health indicators dataset and builds two predictive tasks: (1) a classification task predicting heart disease or attack occurrence, and (2) a regression task predicting BMI as a continuous outcome.

The workflow follows a standard data science pipeline: data loading, quality checks, preprocessing, feature engineering, exploratory analysis, correlation screening, train/validation/test split, normalization, modelling, and evaluation.

2. Load Libraries

This section loads the R libraries required for data manipulation, visualization, statistical analysis, and machine learning.

library(tidyverse)  # attaches dplyr, tidyr, ggplot2, forcats, and related packages
library(scales)     # percent() and other axis-label formatters

3. Load Dataset

The dataset is imported from a CSV file and inspected to understand its structure, variable types, and overall distribution.

df <- read.csv("C:/Users/Wei Khang/heartDisease/heart_disease.csv")

head(df)
##   HeartDiseaseorAttack HighBP HighChol CholCheck BMI Smoker Stroke Diabetes
## 1                    0      1        1         1  40      1      0        0
## 2                    0      0        0         0  25      1      0        0
## 3                    0      1        1         1  28      0      0        0
## 4                    0      1        0         1  27      0      0        0
## 5                    0      1        1         1  24      0      0        0
## 6                    0      1        1         1  25      1      0        0
##   PhysActivity Fruits Veggies HvyAlcoholConsump AnyHealthcare NoDocbcCost
## 1            0      0       1                 0             1           0
## 2            1      0       0                 0             0           1
## 3            0      1       0                 0             1           1
## 4            1      1       1                 0             1           0
## 5            1      1       1                 0             1           0
## 6            1      1       1                 0             1           0
##   GenHlth MentHlth PhysHlth DiffWalk Sex Age Education Income
## 1       5       18       15        1   0   9         4      3
## 2       3        0        0        0   0   7         6      1
## 3       5       30       30        1   0   9         4      8
## 4       2        0        0        0   0  11         3      6
## 5       2        3        0        0   0  11         5      4
## 6       2        0        2        0   1  10         6      8
dim(df)
## [1] 253680     22
str(df)
## 'data.frame':    253680 obs. of  22 variables:
##  $ HeartDiseaseorAttack: num  0 0 0 0 0 0 0 0 1 0 ...
##  $ HighBP              : num  1 0 1 1 1 1 1 1 1 0 ...
##  $ HighChol            : num  1 0 1 0 1 1 0 1 1 0 ...
##  $ CholCheck           : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ BMI                 : num  40 25 28 27 24 25 30 25 30 24 ...
##  $ Smoker              : num  1 1 0 0 0 1 1 1 1 0 ...
##  $ Stroke              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Diabetes            : num  0 0 0 0 0 0 0 0 2 0 ...
##  $ PhysActivity        : num  0 1 0 1 1 1 0 1 0 0 ...
##  $ Fruits              : num  0 0 1 1 1 1 0 0 1 0 ...
##  $ Veggies             : num  1 0 0 1 1 1 0 1 1 1 ...
##  $ HvyAlcoholConsump   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AnyHealthcare       : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ NoDocbcCost         : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ GenHlth             : num  5 3 5 2 2 2 3 3 5 2 ...
##  $ MentHlth            : num  18 0 30 0 3 0 0 0 30 0 ...
##  $ PhysHlth            : num  15 0 30 0 0 2 14 0 30 0 ...
##  $ DiffWalk            : num  1 0 1 0 0 0 0 1 1 0 ...
##  $ Sex                 : num  0 0 0 0 0 1 0 0 0 1 ...
##  $ Age                 : num  9 7 9 11 11 10 9 11 9 8 ...
##  $ Education           : num  4 6 4 3 5 6 6 4 5 4 ...
##  $ Income              : num  3 1 8 6 4 8 7 4 1 3 ...
summary(df)
##  HeartDiseaseorAttack     HighBP         HighChol        CholCheck     
##  Min.   :0.00000      Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000      1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.00000      Median :0.000   Median :0.0000   Median :1.0000  
##  Mean   :0.09419      Mean   :0.429   Mean   :0.4241   Mean   :0.9627  
##  3rd Qu.:0.00000      3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000      Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :204          NA's   :228     NA's   :213      NA's   :213     
##       BMI            Smoker           Stroke           Diabetes     
##  Min.   :12.00   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:24.00   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :27.00   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :28.38   Mean   :0.4431   Mean   :0.04057   Mean   :0.2969  
##  3rd Qu.:31.00   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :98.00   Max.   :1.0000   Max.   :1.00000   Max.   :2.0000  
##  NA's   :210     NA's   :235      NA's   :198       NA's   :223     
##   PhysActivity        Fruits          Veggies       HvyAlcoholConsump
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000   
##  Mean   :0.7565   Mean   :0.6343   Mean   :0.8114   Mean   :0.0562   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##  NA's   :216      NA's   :214      NA's   :220      NA's   :220      
##  AnyHealthcare     NoDocbcCost         GenHlth         MentHlth     
##  Min.   :0.0000   Min.   :0.00000   Min.   :1.000   Min.   : 0.000  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:2.000   1st Qu.: 0.000  
##  Median :1.0000   Median :0.00000   Median :2.000   Median : 0.000  
##  Mean   :0.9511   Mean   :0.08419   Mean   :2.511   Mean   : 3.186  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:3.000   3rd Qu.: 2.000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :5.000   Max.   :30.000  
##  NA's   :225      NA's   :229       NA's   :213     NA's   :216     
##     PhysHlth         DiffWalk           Sex              Age        
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 6.000  
##  Median : 0.000   Median :0.0000   Median :0.0000   Median : 8.000  
##  Mean   : 4.242   Mean   :0.1682   Mean   :0.4404   Mean   : 8.032  
##  3rd Qu.: 3.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:10.000  
##  Max.   :30.000   Max.   :1.0000   Max.   :1.0000   Max.   :13.000  
##  NA's   :203      NA's   :231      NA's   :230      NA's   :212     
##    Education         Income     
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:5.000  
##  Median :5.000   Median :7.000  
##  Mean   :5.051   Mean   :6.054  
##  3rd Qu.:6.000   3rd Qu.:8.000  
##  Max.   :6.000   Max.   :8.000  
##  NA's   :211     NA's   :228

4. Data Quality Checks

This section checks common data issues (missing values and duplicates) before cleaning.

4.1 Missing Values

Missing values are examined at both the column level and the dataset level to determine the extent of incomplete records.

colSums(is.na(df))
## HeartDiseaseorAttack               HighBP             HighChol 
##                  204                  228                  213 
##            CholCheck                  BMI               Smoker 
##                  213                  210                  235 
##               Stroke             Diabetes         PhysActivity 
##                  198                  223                  216 
##               Fruits              Veggies    HvyAlcoholConsump 
##                  214                  220                  220 
##        AnyHealthcare          NoDocbcCost              GenHlth 
##                  225                  229                  213 
##             MentHlth             PhysHlth             DiffWalk 
##                  216                  203                  231 
##                  Sex                  Age            Education 
##                  230                  212                  211 
##               Income 
##                  228
sum(is.na(df))
## [1] 4792

4.2 Duplicate Records

Duplicate rows are identified, as they may bias descriptive statistics and model performance.

sum(duplicated(df))
## [1] 23697

5. Data Preprocessing

Data preprocessing is performed to ensure reliability and consistency of the dataset. This includes removing missing values, eliminating duplicate records, handling outliers, and validating variable ranges. Since the 4,792 missing cells amount to roughly 200 per column out of 253,680 rows (under 0.1%), complete-case deletion discards little information.

df_clean <- df

5.1 Remove missing values

df_clean <- df_clean %>% drop_na()

5.2 Remove duplicates

df_clean <- df_clean %>% distinct()

5.3 Handle BMI outliers (Winsorization)

BMI values are capped at the 1st and 99th percentiles to reduce the influence of extreme outliers.

bmi_q <- quantile(df_clean$BMI, probs = c(0.01, 0.99), na.rm = TRUE)

df_clean <- df_clean %>%
mutate(BMI_capped = pmin(pmax(BMI, bmi_q[1]), bmi_q[2]))

5.4 Recode General Health Variable

The original GenHlth variable is recorded on an ordinal scale from 1 (Excellent) to 5 (Poor).
It is recoded into an ordered factor with meaningful labels to improve interpretability during analysis and visualization.

df_clean <- df_clean %>%
  mutate(
    GenHlth_Factor = factor(GenHlth,
                            levels = c(1,2,3,4,5),
                            labels = c("Excellent","Very Good","Good","Fair","Poor"),
                            ordered = TRUE
    )
  )

5.5 Validate Mental and Physical Health Day Ranges

The MentHlth and PhysHlth variables represent the number of unhealthy days within the past 30 days. Observations exceeding this valid range are identified and removed to ensure data integrity.

# -----------------------
# Count invalid values
# -----------------------
invalid_ment <- sum(df_clean$MentHlth > 30, na.rm = TRUE)
invalid_phys <- sum(df_clean$PhysHlth > 30, na.rm = TRUE)

cat("Number of rows with MentHlth > 30:", invalid_ment, "\n")
## Number of rows with MentHlth > 30: 0
cat("Number of rows with PhysHlth > 30:", invalid_phys, "\n")
## Number of rows with PhysHlth > 30: 0
# -----------------------
# Total invalid rows
# -----------------------
invalid_total <- sum((df_clean$MentHlth > 30) | (df_clean$PhysHlth > 30), na.rm = TRUE)
cat("Total rows to be removed due to invalid health day values:", invalid_total, "\n")
## Total rows to be removed due to invalid health day values: 0
# -----------------------
# Remove invalid rows
# -----------------------
df_clean <- df_clean %>%
  filter(MentHlth <= 30, PhysHlth <= 30)

6. Feature Engineering

Feature engineering improves interpretability and may increase predictive performance by combining multiple related indicators into higher-level features.

6.1 BMI Category

BMI is grouped into standard clinical categories (Underweight, Normal, Overweight, Obese) to support categorical comparisons in EDA and modelling.

df_clean <- df_clean %>%
  mutate(
    # BMI categorical group
    BMI_Category = cut(
      BMI,
      breaks = c(-Inf, 18.5, 25, 30, Inf),
      labels = c("Underweight", "Normal", "Overweight", "Obese")
    ),
    BMI_Category = as.factor(BMI_Category)
  )

df_clean %>% 
  select(BMI, BMI_Category) %>% 
  head(10)
##    BMI BMI_Category
## 1   40        Obese
## 2   25       Normal
## 3   28   Overweight
## 4   27   Overweight
## 5   24       Normal
## 6   25       Normal
## 7   30   Overweight
## 8   25       Normal
## 9   30   Overweight
## 10  24       Normal
# Additional range check: the PhysHlth distribution confirms all values lie within 0-30
table(df_clean$PhysHlth)
## 
##      0      1      2      3      4      5      6      7      8      9     10 
## 136221  11002  14410   8395   4494   7553   1312   4513    805    179   5564 
##     11     12     13     14     15     16     17     18     19     20     21 
##     59    578     68   2577   4895    111     95    150     22   3250    660 
##     22     23     24     25     26     27     28     29     30 
##     70     55     72   1330     69     97    520    213  19286

6.2 Age Group (CDC Mapping)

The Age variable is coded from 1–13 and represents age ranges rather than exact ages. These codes are mapped to meaningful age groups and treated as ordered categories.

df_clean <- df_clean %>%
  mutate(
    AgeGroup = factor(Age,
                      levels = 1:13,
                      labels = c(
                        "18-24", "25-29", "30-34", "35-39", "40-44",
                        "45-49", "50-54", "55-59", "60-64",
                        "65-69", "70-74", "75-79", "80+"
                      ),
                      ordered = TRUE
    )
  )

df_clean %>% 
  select(Age, AgeGroup) %>% 
  head(10)
##    Age AgeGroup
## 1    9    60-64
## 2    7    50-54
## 3    9    60-64
## 4   11    70-74
## 5   11    70-74
## 6   10    65-69
## 7    9    60-64
## 8   11    70-74
## 9    9    60-64
## 10   8    55-59
table(df_clean$AgeGroup)
## 
## 18-24 25-29 30-34 35-39 40-44 45-49 50-54 55-59 60-64 65-69 70-74 75-79   80+ 
##  5488  7022  9986 12157 13982 17219 23007 27174 29599 29023 21933 15301 16734

6.3 Broader Age Bands

Broader age bands are created to simplify interpretation and reduce sparsity when comparing heart disease rates across age.

df_clean <- df_clean %>%
  mutate(
    AgeBand = case_when(
      Age <= 3  ~ "18-34",   # Young adults
      Age <= 6  ~ "35-49",   # Middle age
      Age <= 9  ~ "50-64",   # Older adults
      Age <= 11 ~ "65-74",   # Elderly
      TRUE      ~ "75+"      # Very elderly
    ),
    AgeBand = factor(AgeBand, ordered = TRUE)
  )

df_clean %>% 
  select(Age, AgeBand) %>% 
  head(10)
##    Age AgeBand
## 1    9   50-64
## 2    7   50-64
## 3    9   50-64
## 4   11   65-74
## 5   11   65-74
## 6   10   65-74
## 7    9   50-64
## 8   11   65-74
## 9    9   50-64
## 10   8   50-64
table(df_clean$AgeBand)
## 
## 18-34 35-49 50-64 65-74   75+ 
## 22496 43358 79780 50956 32035

6.4 Lifestyle Risk Score

A composite Lifestyle Risk Score is constructed by combining multiple unhealthy lifestyle behaviours into a single index. Higher scores represent higher lifestyle risk.

df_clean <- df_clean %>%
  mutate(
    RiskScore =
      as.numeric(Smoker) +              # Smoking behavior
      as.numeric(HvyAlcoholConsump) +   # Heavy drinking behavior
      (1 - as.numeric(PhysActivity)) +  # Inactivity
      (1 - as.numeric(Fruits)) +        # Low fruit intake
      (1 - as.numeric(Veggies))         # Low vegetable intake
  )

df_clean %>%
  select(Smoker, HvyAlcoholConsump, PhysActivity, Fruits, Veggies, RiskScore) %>%
  head(10)
##    Smoker HvyAlcoholConsump PhysActivity Fruits Veggies RiskScore
## 1       1                 0            0      0       1         3
## 2       1                 0            1      0       0         3
## 3       0                 0            0      1       0         2
## 4       0                 0            1      1       1         0
## 5       0                 0            1      1       1         0
## 6       1                 0            1      1       1         1
## 7       1                 0            0      0       0         4
## 8       1                 0            1      0       1         2
## 9       1                 0            0      1       1         2
## 10      0                 0            0      0       1         2
summary(df_clean$RiskScore)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   1.000   1.385   2.000   5.000

6.5 Disease Burden Index

Disease burden is measured as the count of chronic conditions (high blood pressure, high cholesterol, diabetes, and stroke history).

df_clean <- df_clean %>%
  mutate(
    DiseaseCount =
      as.numeric(HighBP) +      # High blood pressure
      as.numeric(HighChol) +    # High cholesterol
      as.numeric(Diabetes) +    # Diabetes
      as.numeric(Stroke)        # History of stroke
  )

# Inspect disease burden result
df_clean %>%
  select(HighBP, HighChol, Diabetes, Stroke, DiseaseCount) %>%
  head(10)
##    HighBP HighChol Diabetes Stroke DiseaseCount
## 1       1        1        0      0            2
## 2       0        0        0      0            0
## 3       1        1        0      0            2
## 4       1        0        0      0            1
## 5       1        1        0      0            2
## 6       1        1        0      0            2
## 7       1        0        0      0            1
## 8       1        1        0      0            2
## 9       1        1        2      0            4
## 10      0        0        0      0            0
table(df_clean$DiseaseCount)
## 
##     0     1     2     3     4     5 
## 79409 67121 46035 16094 17780  2186

6.6 Health Stress Index (Mental + Physical Health)

df_clean <- df_clean %>%
  mutate(
    HealthStressIndex = MentHlth + PhysHlth
  )

df_clean %>% 
  select(MentHlth, PhysHlth, HealthStressIndex) %>% 
  head(10)
##    MentHlth PhysHlth HealthStressIndex
## 1        18       15                33
## 2         0        0                 0
## 3        30       30                60
## 4         0        0                 0
## 5         3        0                 3
## 6         0        2                 2
## 7         0       14                14
## 8         0        0                 0
## 9        30       30                60
## 10        0        0                 0
summary(df_clean$HealthStressIndex)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    2.00    8.18   10.00   60.00

6.7 Healthcare Access Score

Healthcare access is summarized using insurance coverage and affordability of doctor visits.

df_clean <- df_clean %>%
  mutate(
    HealthcareScore =
      as.numeric(AnyHealthcare) +       # Has insurance
      (1 - as.numeric(NoDocbcCost))     # Could afford doctor visit
  )

df_clean %>% 
  select(AnyHealthcare, NoDocbcCost, HealthcareScore) %>% 
  head(10)
##    AnyHealthcare NoDocbcCost HealthcareScore
## 1              1           0               2
## 2              0           1               0
## 3              1           1               1
## 4              1           0               2
## 5              1           0               2
## 6              1           0               2
## 7              1           0               2
## 8              1           0               2
## 9              1           0               2
## 10             1           0               2
table(df_clean$HealthcareScore)
## 
##      0      1      2 
##   4556  24432 199637

6.8 Obesity Flag

An obesity indicator is created using the BMI ≥ 30 threshold.

df_clean <- df_clean %>%
  mutate(
    ObeseFlag = ifelse(BMI >= 30, 1, 0),
    ObeseFlag = factor(ObeseFlag)
  )

df_clean %>% 
  select(BMI, ObeseFlag) %>% 
  head(10)
##    BMI ObeseFlag
## 1   40         1
## 2   25         0
## 3   28         0
## 4   27         0
## 5   24         0
## 6   25         0
## 7   30         1
## 8   25         0
## 9   30         1
## 10  24         0
table(df_clean$ObeseFlag)
## 
##      0      1 
## 144111  84514

6.9 Lifestyle Profile Category

RiskScore is converted into an interpretable categorical lifestyle profile (Healthy, ModerateRisk, HighRisk).

df_clean <- df_clean %>%
  mutate(
    LifestyleProfile = case_when(
      RiskScore <= 1 ~ "Healthy",
      RiskScore <= 3 ~ "ModerateRisk",
      TRUE ~ "HighRisk"
    ),
    LifestyleProfile = factor(LifestyleProfile)
  )

df_clean %>% 
  select(RiskScore, LifestyleProfile) %>% 
  head(10)
##    RiskScore LifestyleProfile
## 1          3     ModerateRisk
## 2          3     ModerateRisk
## 3          2     ModerateRisk
## 4          0          Healthy
## 5          0          Healthy
## 6          1          Healthy
## 7          4         HighRisk
## 8          2     ModerateRisk
## 9          2     ModerateRisk
## 10         2     ModerateRisk
table(df_clean$LifestyleProfile)
## 
##      Healthy     HighRisk ModerateRisk 
##       133274         8424        86927

7. Exploratory Data Analysis

Exploratory analysis is conducted to examine the distribution of the target variable and its association with key engineered features (age, sex, BMI category, obesity status, lifestyle profile, and self-reported health).

library(ggplot2)
library(dplyr)
library(scales)

df_plot <- df_clean %>%
  mutate(
    HeartDisease = factor(HeartDiseaseorAttack,
                          levels = c(0, 1),
                          labels = c("No", "Yes")),
    SexLabel = factor(Sex,
                      levels = c(0, 1),
                      labels = c("Female", "Male"))
  )

7.1 Heart Disease Prevalence

This plot summarizes the overall prevalence of heart disease/attack in the dataset.

df_donut <- df_plot %>%
  count(HeartDisease) %>%
  mutate(prop = n / sum(n))

ggplot(df_donut, aes(x = 2, y = prop, fill = HeartDisease)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  xlim(0.5, 2.5) +
  theme_void() +
  labs(title = "Heart Disease Prevalence (Donut Chart)") +
  geom_text(aes(label = percent(prop)), 
            position = position_stack(vjust = 0.5))

#### Observation: The donut chart shows that approximately 10% of individuals in the dataset have experienced heart disease or attack, while 90% have not.

Conclusion:

Heart disease cases form a minority class, indicating a class-imbalanced dataset, which should be considered during model training and evaluation.
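
A quick numeric check of this imbalance (a minimal sketch using the df_plot frame defined above; output not shown):

tab <- table(df_plot$HeartDisease)
print(tab)
cat("Imbalance ratio (No : Yes):", round(tab[["No"]] / tab[["Yes"]], 1), "\n")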

7.2 Heart Disease Rate by Age Band

Heart disease proportion is compared across age bands to assess how prevalence changes with age.

ggplot(df_plot, aes(x = AgeBand, fill = HeartDisease)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  labs(title = "Heart Disease Rate by Age Band",
       x = "Age Band",
       y = "Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#### Observation: The proportion of heart disease increases steadily with age, with the lowest prevalence in the 18–34 group and the highest in the 75+ group.

Conclusion:

Age is a strong risk factor for heart disease, and older age bands are significantly more vulnerable, highlighting the importance of age in predictive modelling.

7.3 Heart Disease Rate by Sex

This plot compares heart disease prevalence between male and female respondents.

ggplot(df_plot, aes(x = SexLabel, fill = HeartDisease)) +
  geom_bar(position = "fill", color = "white") +
  scale_fill_manual(values = c("#F76C6C", "#1ECBE1")) +
  scale_y_continuous(labels = percent) +
  labs(title = "Heart Disease Rate by Sex",
       x = "Sex",
       y = "Proportion") +
  theme_minimal(base_size = 14)

#### Observation: Males exhibit a higher proportion of heart disease cases compared to females.

Conclusion:

Sex appears to be an important demographic factor, with males showing greater susceptibility to heart disease in this dataset.

7.4 Heart Disease Rate by BMI Category

Heart disease prevalence is compared across BMI categories.

ggplot(df_plot, aes(BMI_Category, fill = HeartDisease)) +
  geom_bar(position = "fill", color = "white") +
  scale_fill_manual(values = c("#F9A825", "#1976D2")) +
  labs(title = "Heart Disease Rate by BMI Category",
       x = "BMI Category",
       y = "Proportion") +
  scale_y_continuous(labels = percent) +
  theme_minimal(base_size = 14)

#### Observation: Heart disease prevalence is lowest in the Normal BMI group and higher among Overweight and Obese individuals, with Obese showing the highest proportion.

Conclusion:

Higher BMI is associated with an increased risk of heart disease, indicating BMI as an important predictive health indicator.

7.5 Heart Disease Rate by Obesity Status

This plot compares heart disease prevalence between obese and non-obese individuals.

ggplot(df_plot, aes(factor(ObeseFlag), fill = HeartDisease)) +
  geom_bar(position = "fill", width = 0.5) +
  scale_fill_manual(values = c("#81D4FA", "#EF5350")) +
  labs(title = "Heart Disease Rate by Obesity Status",
       x = "Obesity (0 = No, 1 = Yes)",
       y = "Proportion") +
  scale_y_continuous(labels = percent) +
  theme_minimal(base_size = 14)

#### Observation: Obese individuals (Obesity = 1) exhibit a higher proportion of heart disease compared to non-obese individuals.

Conclusion:

Obesity status is positively associated with heart disease prevalence, reinforcing its role as a key risk factor.

7.6 Heart Disease Rate by Lifestyle Profile

This plot evaluates heart disease prevalence across lifestyle risk profiles.

ggplot(df_plot, aes(LifestyleProfile, fill = HeartDisease)) +
  geom_bar(position = "fill", color = "white") +
  scale_fill_manual(values = c("#43A047", "#EF5350")) +
  scale_y_continuous(labels = percent) +
  labs(title = "Heart Disease Rate by Lifestyle Profile",
       x = "Lifestyle Category",
       y = "Proportion") +
  theme_minimal(base_size = 14)

#### Observation: The **High-Risk lifestyle group** shows the highest heart disease prevalence, followed by the Moderate-Risk group, while the Healthy group has the lowest.

Conclusion:

Unhealthy lifestyle behaviours significantly increase heart disease risk, highlighting the impact of combined lifestyle factors.

7.7 Heart Disease Rate by Self-Reported General Health

Self-reported general health is compared against heart disease prevalence.

ggplot(df_plot, aes(x = GenHlth_Factor, fill = HeartDisease)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  labs(title = "Heart Disease Rate by Self-Reported General Health",
       x = "General Health",
       y = "Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#### Observation: Heart disease prevalence increases sharply as self-reported general health declines from Excellent to Poor.

Conclusion:

Self-reported general health is a strong indicator of heart disease risk and reflects overall physical well-being.

7.8 Health Stress Index by Heart Disease Status

This boxplot compares the distribution of combined physical and mental unhealthy days between individuals with and without heart disease.

ggplot(df_plot, aes(x = HeartDisease, y = HealthStressIndex, fill = HeartDisease)) +
  geom_boxplot() +
  labs(title = "Health Stress Index by Heart Disease Status",
       x = "Heart Disease or Attack",
       y = "HealthStressIndex")

#### Observation: Individuals with heart disease have a higher median Health Stress Index and greater variability compared to those without heart disease.

Conclusion:

Poor physical and mental health (a higher stress burden) is strongly associated with heart disease presence.

8. Correlation Analysis

Correlation analysis is conducted to examine linear relationships among features and to support feature selection for both classification and regression tasks. Since correlation requires numeric inputs, categorical variables are converted into numeric codes. This analysis is exploratory and does not imply causation.

library(corrplot)
library(dplyr)

8.1 Preparation of Numeric Feature Matrix

All factor variables are converted to numeric codes, and the binary target variable is encoded as 0/1 for correlation computation.

df_corr <- df_clean %>%
  mutate(
    HeartDisease_num = as.numeric(as.character(HeartDiseaseorAttack))
  ) %>%
  select(-HeartDiseaseorAttack) %>%       
  mutate(
    across(where(is.factor), ~ as.numeric(.))   
  )

cat("Number of features used for correlation:", ncol(df_corr), "\n")
## Number of features used for correlation: 33

8.2 Correlation Matrix Computation

The correlation matrix is computed using pairwise complete observations to handle any remaining missing values safely.

cor_mat <- cor(df_corr, use = "pairwise.complete.obs")

dim(cor_mat)
## [1] 33 33

8.3 Correlation Heatmap

The heatmap below visualizes the full correlation matrix; since the matrix is symmetric, each variable pair appears twice (once in each triangle).

cor_long <- cor_mat %>%
  as.data.frame() %>%
  mutate(Var1 = rownames(.)) %>%
  pivot_longer(-Var1, names_to = "Var2", values_to = "value")

ggplot(cor_long, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white", size = 0.2) +
  scale_fill_gradient2(
    low = "#6BAED6", mid = "white", high = "#DE2D26",
    midpoint = 0, limit = c(-1, 1), name = "Correlation"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1),
    panel.grid = element_blank()
  ) +
  labs(
    title = "Correlation Heatmap (ggplot2)",
    x = "",
    y = ""
  )
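
If a less redundant display is preferred, one option (a sketch, not used for the figure above) is to blank the upper triangle before reshaping, so each variable pair appears only once:

# Mask the upper triangle of the symmetric matrix
cor_tri <- cor_mat
cor_tri[upper.tri(cor_tri)] <- NA

cor_tri_long <- cor_tri %>%
  as.data.frame() %>%
  mutate(Var1 = rownames(.)) %>%
  pivot_longer(-Var1, names_to = "Var2", values_to = "value") %>%
  filter(!is.na(value))

Feeding cor_tri_long into the same ggplot call then draws only the lower triangle.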

8.4 Feature Correlation with BMI (Regression Target)

This section examines correlations between BMI and other features to support regression feature selection. BMI_capped, BMI_Category, and ObeseFlag are engineered directly from BMI, so their strong correlations below are expected by construction; they are accordingly excluded from the regression feature set in Section 9.

if ("BMI" %in% rownames(cor_mat)) {
  bmi_cor <- sort(cor_mat["BMI", ], decreasing = TRUE)
  cat("\nCorrelation with BMI:\n")
  print(round(bmi_cor, 3))
} else {
  warning("BMI not found in correlation matrix row names.")
}
## 
## Correlation with BMI:
##               BMI        BMI_capped      BMI_Category         ObeseFlag 
##             1.000             0.971             0.808             0.740 
##      DiseaseCount          Diabetes           GenHlth    GenHlth_Factor 
##             0.234             0.212             0.208             0.208 
##            HighBP          DiffWalk HealthStressIndex          PhysHlth 
##             0.194             0.183             0.106             0.103 
##          HighChol         RiskScore  LifestyleProfile          MentHlth 
##             0.090             0.082             0.074             0.069 
##       NoDocbcCost         CholCheck  HeartDisease_num               Sex 
##             0.046             0.042             0.040             0.031 
##            Stroke     AnyHealthcare            Smoker   HealthcareScore 
##             0.011            -0.009            -0.009            -0.038 
##           Veggies               Age          AgeGroup HvyAlcoholConsump 
##            -0.044            -0.050            -0.050            -0.058 
##           AgeBand            Fruits            Income         Education 
##            -0.060            -0.068            -0.069            -0.075 
##      PhysActivity 
##            -0.128

8.5 Feature Correlation with Heart Disease (Classification Target)

This section examines correlations between the binary heart disease outcome and other features.

if ("HeartDisease_num" %in% rownames(cor_mat)) {
  hd_cor <- sort(cor_mat["HeartDisease_num", ], decreasing = TRUE)
  cat("\nCorrelation with HeartDiseaseorAttack (0/1):\n")
  print(round(hd_cor, 3))
} else {
  warning("HeartDisease_num not found in correlation matrix row names.")
}
## 
## Correlation with HeartDiseaseorAttack (0/1):
##  HeartDisease_num      DiseaseCount           GenHlth    GenHlth_Factor 
##             1.000             0.278             0.246             0.246 
##               Age          AgeGroup           AgeBand          DiffWalk 
##             0.223             0.223             0.221             0.203 
##            HighBP            Stroke          HighChol          Diabetes 
##             0.201             0.199             0.176             0.171 
##          PhysHlth HealthStressIndex            Smoker               Sex 
##             0.171             0.142             0.105             0.090 
##         RiskScore  LifestyleProfile          MentHlth         CholCheck 
##             0.083             0.064             0.053             0.050 
##      BMI_Category        BMI_capped               BMI         ObeseFlag 
##             0.045             0.045             0.040             0.039 
##     AnyHealthcare       NoDocbcCost   HealthcareScore            Fruits 
##             0.026             0.022            -0.001            -0.007 
##           Veggies HvyAlcoholConsump      PhysActivity         Education 
##            -0.027            -0.035            -0.073            -0.083 
##            Income 
##            -0.123

9. Dataset Construction and Partitioning

This section prepares two modelling datasets: (1) a classification dataset for predicting heart disease occurrence and (2) a regression dataset for predicting BMI. Features are selected based on exploratory correlation screening and domain knowledge.

9.1 Feature Selection for Classification and Regression

For classification, a set of demographic, health condition, and engineered lifestyle features is selected. For regression, BMI is treated as the target variable and additional socioeconomic factors (income and education) are included.

library(dplyr)

# -----------------------------
# Classification features
# -----------------------------
# NOTE: MentHlth and PhysHlth are included once only (avoid duplicates).
clf_features <- c(
  "HeartDiseaseorAttack",  # target
  "Age", "AgeGroup", "AgeBand",
  "Sex",
  "HighBP", "HighChol", "Diabetes", "Stroke",
  "Smoker", "PhysActivity",
  "MentHlth", "PhysHlth", "HealthStressIndex",
  "DiseaseCount", "ObeseFlag",
  "RiskScore", "LifestyleProfile",
  "BMI"
)

df_clf_raw <- df_clean %>% select(all_of(clf_features))

# -----------------------------
# Regression features
# -----------------------------
reg_features <- c(
  "BMI",                   # target
  "Age", "AgeGroup", "AgeBand",
  "Sex",
  "HighBP", "HighChol", "Diabetes", "Stroke",
  "Smoker", "PhysActivity",
  "MentHlth", "PhysHlth", "HealthStressIndex",
  "DiseaseCount", "RiskScore", "LifestyleProfile",
  "Income", "Education"
)

df_reg_raw <- df_clean %>% select(all_of(reg_features))

9.2 Train/Validation/Test Split

A 60/20/20 split is applied to support model tuning (validation set) and unbiased evaluation (test set). A fixed random seed is used for reproducibility.

set.seed(123)

n <- nrow(df_clean)
indices <- sample(seq_len(n))

train_size <- floor(0.6 * n)
valid_size <- floor(0.2 * n)

train_idx <- indices[1:train_size]
valid_idx <- indices[(train_size + 1):(train_size + valid_size)]
test_idx  <- indices[(train_size + valid_size + 1):n]

cat("\nTrain:", length(train_idx),
    "\nValid:", length(valid_idx),
    "\nTest:",  length(test_idx), "\n")
## 
## Train: 137175 
## Valid: 45725 
## Test: 45725
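
Note that this split is purely random, so each partition's heart disease prevalence may drift slightly from the overall ~10%. A stratified alternative (a sketch, not used here) could rely on caret::createDataPartition, which preserves the outcome ratio in each partition:

# Stratified 60% training split on the outcome (hypothetical alternative)
library(caret)
set.seed(123)
idx_train <- createDataPartition(factor(df_clean$HeartDiseaseorAttack),
                                 p = 0.6, list = FALSE)
rest      <- setdiff(seq_len(nrow(df_clean)), idx_train)
idx_valid <- sample(rest, floor(length(rest) / 2))
idx_test  <- setdiff(rest, idx_valid)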

9.3 Build Classification and Regression Datasets

The classification target is converted into a factor (“No”/“Yes”) for modelling workflows such as SMOTE, while regression retains BMI as a numeric outcome.

# ---------------------
# Classification split
# ---------------------
train_clf <- df_clf_raw[train_idx, ]
valid_clf <- df_clf_raw[valid_idx, ]
test_clf  <- df_clf_raw[test_idx, ]

train_clf$HeartDisease <- factor(train_clf$HeartDiseaseorAttack, levels=c(0,1), labels=c("No","Yes"))
valid_clf$HeartDisease <- factor(valid_clf$HeartDiseaseorAttack, levels=c(0,1), labels=c("No","Yes"))
test_clf$HeartDisease  <- factor(test_clf$HeartDiseaseorAttack, levels=c(0,1), labels=c("No","Yes"))

train_clf <- train_clf %>% select(-HeartDiseaseorAttack) %>% relocate(HeartDisease)
valid_clf <- valid_clf %>% select(-HeartDiseaseorAttack) %>% relocate(HeartDisease)
test_clf  <- test_clf  %>% select(-HeartDiseaseorAttack) %>% relocate(HeartDisease)

# ------------------
# Regression split
# ------------------
train_reg <- df_reg_raw[train_idx, ]
valid_reg <- df_reg_raw[valid_idx, ]
test_reg  <- df_reg_raw[test_idx, ]

10. Normalization (Train-only Fitting)

Selected numeric variables are standardised using mean and standard deviation estimated from the training set only. The learned scaling parameters are then applied to validation and test sets to prevent data leakage.

# --------------------
# Variables to Scale
# --------------------
scale_cols <- c("BMI", "MentHlth", "PhysHlth")

# Fit scaler on TRAIN ONLY (classification train still contains these columns before dropping)
scaler_means <- sapply(train_clf[, scale_cols], mean)
scaler_sds   <- sapply(train_clf[, scale_cols], sd)

scale_apply <- function(df, cols, means, sds) {
  # Subtract the training means, then divide by the training SDs.
  # Both steps use sweep() over columns: dividing the data frame by the
  # `sds` vector directly would recycle it element-wise down the rows
  # instead of matching SDs to columns.
  scaled <- sweep(df[, cols], 2, means, FUN = "-")
  scaled <- sweep(scaled, 2, sds, FUN = "/")
  df[, paste0(cols, "_scaled")] <- scaled
  return(df)
}

# -----------------
# Apply Scaling
# -----------------
train_clf <- scale_apply(train_clf, scale_cols, scaler_means, scaler_sds)
valid_clf <- scale_apply(valid_clf, scale_cols, scaler_means, scaler_sds)
test_clf  <- scale_apply(test_clf,  scale_cols, scaler_means, scaler_sds)

train_reg <- scale_apply(train_reg, scale_cols, scaler_means, scaler_sds)
valid_reg <- scale_apply(valid_reg, scale_cols, scaler_means, scaler_sds)
test_reg  <- scale_apply(test_reg,  scale_cols, scaler_means, scaler_sds)
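
As a quick leakage check (a sketch; output not shown), the scaled training columns should have mean close to 0 and standard deviation close to 1, while the validation and test columns will deviate slightly because they reuse the training parameters:

scaled_cols <- paste0(scale_cols, "_scaled")
round(colMeans(train_clf[, scaled_cols]), 3)    # expect ~0 on training data
round(sapply(train_clf[, scaled_cols], sd), 3)  # expect ~1 on training data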

10.1 Drop Unscaled Versions Where Appropriate

For classification, unscaled BMI/MentHlth/PhysHlth are removed after scaling. For regression, BMI is retained as the target; the scaled BMI column is removed.

# -----------------------------------------
# Classification: drop unscaled originals
# -----------------------------------------
train_clf <- train_clf %>% select(-BMI, -MentHlth, -PhysHlth)
valid_clf <- valid_clf %>% select(-BMI, -MentHlth, -PhysHlth)
test_clf  <- test_clf  %>% select(-BMI, -MentHlth, -PhysHlth)

# ----------------------------------------------------------------------
# Regression: keep BMI target, drop BMI_scaled and unscaled Ment/Phys
# ----------------------------------------------------------------------
train_reg <- train_reg %>% select(-BMI_scaled,-MentHlth, -PhysHlth)
valid_reg <- valid_reg %>% select(-BMI_scaled,-MentHlth, -PhysHlth)
test_reg  <- test_reg  %>% select(-BMI_scaled,-MentHlth, -PhysHlth)

cat("\n==== FINAL DATASET SHAPES ==== \n")
## 
## ==== FINAL DATASET SHAPES ====
cat("\n[Classification]\n")
## 
## [Classification]
cat("Train:", nrow(train_clf), "rows,", ncol(train_clf), "columns\n")
## Train: 137175 rows, 19 columns
cat("Valid:", nrow(valid_clf), "rows,", ncol(valid_clf), "columns\n")
## Valid: 45725 rows, 19 columns
cat("Test :", nrow(test_clf),  "rows,", ncol(test_clf),  "columns\n")
## Test : 45725 rows, 19 columns
cat("\n[Regression]\n")
## 
## [Regression]
cat("Train:", nrow(train_reg), "rows,", ncol(train_reg), "columns\n")
## Train: 137175 rows, 19 columns
cat("Valid:", nrow(valid_reg), "rows,", ncol(valid_reg), "columns\n")
## Valid: 45725 rows, 19 columns
cat("Test :", nrow(test_reg),  "rows,", ncol(test_reg),  "columns\n")
## Test : 45725 rows, 19 columns

11. Modelling

This section develops models for both tasks. For classification, two ensemble approaches are compared: LightGBM trained on SMOTE-balanced data and XGBoost trained on original data with class weighting. For regression, a Random Forest (Ranger) model and an XGBoost regression model are trained and evaluated using standard regression metrics.

11A. Classification Modelling

11A.1 Modelling Strategy

The classification pipeline uses two imbalance-handling strategies:

- SMOTE is applied to the training data for LightGBM to reduce class imbalance.
- Class weighting (scale_pos_weight) is used for XGBoost without SMOTE to preserve the original data distribution.

library(lightgbm)
library(ParBayesianOptimization)
library(Matrix)
library(smotefamily)
library(pROC)
library(caret)
library(xgboost)

11A.2 Data Preparation for Modelling

The response variable is converted into numeric form (0/1) for model training. All predictors are one-hot encoded using model.matrix().

train_y_num <- ifelse(train_clf$HeartDisease == "Yes", 1, 0)
valid_y_num <- ifelse(valid_clf$HeartDisease == "Yes", 1, 0)
test_y_num  <- ifelse(test_clf$HeartDisease  == "Yes", 1, 0)

X_train <- model.matrix(HeartDisease ~ ., train_clf)[, -1]
X_valid <- model.matrix(HeartDisease ~ ., valid_clf)[, -1]
X_test  <- model.matrix(HeartDisease ~ ., test_clf)[, -1]

11A.3 LightGBM with SMOTE + Bayesian Optimization

SMOTE is applied to the training data only. Bayesian Optimization is used to tune key LightGBM hyperparameters using validation AUC.

11A.3.1 Apply SMOTE (Training Set Only)

SMOTE is applied only to the training data to reduce class imbalance without leaking information into validation or test sets. With dup_size = 3, three synthetic cases are generated per original minority case, so the minority class grows to roughly four times its original size (14,133 → 56,532) rather than being fully balanced.

cat("\n--- Running SMOTE on training data ---\n")
## 
## --- Running SMOTE on training data ---
df_smote_X <- as.data.frame(X_train)
df_smote_y <- factor(train_y_num, levels = c(0, 1))

smote_out <- SMOTE(
  df_smote_X,
  df_smote_y,
  K = 5,
  dup_size = 3
)

X_train_smote <- as.matrix(smote_out$data[, -ncol(smote_out$data)])
y_train_smote <- as.numeric(as.character(smote_out$data$class))

cat("\nOriginal distribution:\n")
## 
## Original distribution:
print(table(train_y_num))
## train_y_num
##      0      1 
## 123042  14133
cat("\nAfter SMOTE:\n")
## 
## After SMOTE:
print(table(y_train_smote))
## y_train_smote
##      0      1 
## 123042  56532
11A.3.2 Create LightGBM Datasets

LightGBM datasets are created for training and validation in preparation for hyperparameter tuning.

dtrain <- lgb.Dataset(
  data = X_train_smote,
  label = y_train_smote
)

dvalid <- lgb.Dataset.create.valid(
  dtrain,
  data = as.matrix(X_valid),
  label = valid_y_num
)
11A.3.3 Bayesian Optimization for LightGBM (AUC)

Bayesian Optimization is used to tune key LightGBM hyperparameters. The objective is to maximize validation AUC with early stopping.

lgb_bayes <- function(learning_rate, num_leaves, feature_fraction,
                      bagging_fraction, min_data_in_leaf) {
  
  params <- list(
    objective = "binary",
    metric = "auc",
    learning_rate = learning_rate,
    num_leaves = as.integer(num_leaves),
    feature_fraction = feature_fraction,
    bagging_fraction = bagging_fraction,
    bagging_freq = 5,
    min_data_in_leaf = as.integer(min_data_in_leaf),
    
    # feature_pre_filter must be FALSE so min_data_in_leaf can vary across
    # tuning calls that reuse the same constructed lgb.Dataset
    feature_pre_filter = FALSE,
    force_row_wise = TRUE,
    verbosity = -1
  )
  
  model <- lgb.train(
    params = params,
    data = dtrain,
    valids = list(valid = dvalid),
    nrounds = 500,
    early_stopping_rounds = 30,
    verbose = -1
  )
  
  best_auc <- max(unlist(model$record_evals$valid$auc$eval))
  return(list(Score = best_auc))
}

set.seed(123)

opt_results <- bayesOpt(
  FUN = lgb_bayes,
  bounds = list(
    learning_rate = c(0.01, 0.2),
    num_leaves = c(20L, 80L),
    feature_fraction = c(0.6, 1.0),
    bagging_fraction = c(0.6, 1.0),
    min_data_in_leaf = c(10L, 80L)
  ),
  initPoints = 8,   
  iters.n = 20,
  acq = "ucb",
  kappa = 2.5,
  parallel = FALSE,
  verbose = TRUE
)
## 
## Running initial scoring function 8 times in 1 thread(s)...
##         9.66 seconds
##   3) Running FUN 1 times in 1 thread(s)...
cat("\n========== BEST BAYESIAN OPT PARAMETERS ==========\n")
## 
## ========== BEST BAYESIAN OPT PARAMETERS ==========
print(getBestPars(opt_results))
## $learning_rate
## [1] 0.08516086
## 
## $num_leaves
## [1] 20
## 
## $feature_fraction
## [1] 0.8480176
## 
## $bagging_fraction
## [1] 0.6
## 
## $min_data_in_leaf
## [1] 54
best_params <- getBestPars(opt_results)
11A.3.4 Train Final LightGBM and Evaluate on Test Set

The final model is trained using optimized hyperparameters and evaluated on the test set using Accuracy, Precision, Recall, F1-score, and AUC.

final_params <- list(
  objective = "binary",
  metric = "auc",
  learning_rate = best_params$learning_rate,
  num_leaves = as.integer(best_params$num_leaves),
  feature_fraction = best_params$feature_fraction,
  bagging_fraction = best_params$bagging_fraction,
  bagging_freq = 5,
  min_data_in_leaf = as.integer(best_params$min_data_in_leaf),
  feature_pre_filter = FALSE,
  force_row_wise = TRUE,
  verbosity = -1
)

final_model <- lgb.train(
  params = final_params,
  data = dtrain,
  valids = list(valid = dvalid),
  nrounds = 700,
  early_stopping_rounds = 40,
  verbose = 1
)

pred_prob <- predict(final_model, as.matrix(X_test))
pred <- ifelse(pred_prob > 0.5, 1, 0)

pred_factor <- factor(pred, levels = c(0, 1))
test_factor <- factor(test_y_num, levels = c(0, 1))

conf <- table(Predicted = pred_factor, Actual = test_factor)

acc <- mean(pred == test_y_num)
# NOTE: caret::precision/recall/F_meas treat the FIRST factor level ("0" = no
# heart disease) as the relevant class by default, so the values below describe
# the majority class; pass relevant = "1" to score the positive class instead.
prec <- caret::precision(pred_factor, test_factor)
rec <- caret::recall(pred_factor, test_factor)
f1 <- caret::F_meas(pred_factor, test_factor)
auc <- auc(test_y_num, pred_prob)

cat("\n================ FINAL LIGHTGBM PERFORMANCE ================\n")
## 
## ================ FINAL LIGHTGBM PERFORMANCE ================
print(conf)
##          Actual
## Predicted     0     1
##         0 40109  4084
##         1   814   718
cat(sprintf(
  "\nAccuracy: %.4f\nPrecision: %.4f\nRecall: %.4f\nF1 Score: %.4f\nAUC: %.4f\n",
  acc, prec, rec, f1, auc
))
## 
## Accuracy: 0.8929
## Precision: 0.9076
## Recall: 0.9801
## F1 Score: 0.9425
## AUC: 0.8218
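
The fixed 0.5 cutoff above favours the majority class: most true positives are missed even though the AUC is reasonable. A threshold tuned on the validation set (a sketch, not part of the original pipeline; pred_prob_valid is introduced here for illustration) can trade precision for recall:

# Choose the cutoff that maximizes Youden's J on validation predictions
pred_prob_valid <- predict(final_model, as.matrix(X_valid))
roc_valid <- roc(valid_y_num, pred_prob_valid)
best_thr <- coords(roc_valid, "best", ret = "threshold",
                   best.method = "youden")[["threshold"]]

# Re-threshold the existing test probabilities with the tuned cutoff
pred_tuned <- ifelse(pred_prob > best_thr, 1, 0)
table(Predicted = pred_tuned, Actual = test_y_num)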

11A.4 XGBoost with Class Weighting + Bayesian Optimization

XGBoost is tuned using Bayesian Optimization, using class weighting to address imbalance without resampling.

11A.4.1 Prepare DMatrix and Class Weight

XGBoost is trained using the original (unbalanced) training data. Class imbalance is handled using scale_pos_weight.

dtrain_xgb <- xgb.DMatrix(data = X_train, label = train_y_num)
dvalid_xgb <- xgb.DMatrix(data = X_valid, label = valid_y_num)
dtest_xgb  <- xgb.DMatrix(data = X_test,  label = test_y_num)

scale_pos_weight_val <- sum(train_y_num == 0) / sum(train_y_num == 1)
scale_pos_weight_val
## [1] 8.706007
11A.4.2 Bayesian Optimization for XGBoost (AUC)

Bayesian Optimization is used to tune XGBoost hyperparameters by maximizing validation AUC.

xgb_bayes <- function(eta, max_depth, subsample,
                      colsample_bytree, min_child_weight) {
  
  params <- list(
    objective = "binary:logistic",
    eval_metric = "auc",
    eta = eta,
    max_depth = as.integer(max_depth),
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    min_child_weight = min_child_weight,
    scale_pos_weight = scale_pos_weight_val,
    tree_method = "hist",
    booster = "gbtree"
  )
  
  model <- tryCatch({
    xgb.train(
      params = params,
      data = dtrain_xgb,
      nrounds = 500,
      watchlist = list(valid = dvalid_xgb),
      early_stopping_rounds = 20,
      verbose = 0
    )
  }, error = function(e) NULL)
  
  # Fail-safe: if training errored, return a neutral AUC (0.5) so the
  # optimization loop can continue instead of aborting
  if (is.null(model)) {
    return(list(Score = 0.5))
  }
  
  best_auc <- suppressWarnings(
    max(model$evaluation_log$valid_auc, na.rm = TRUE)
  )
  
  # Prevent NA / Inf / constant collapse
  if (!is.finite(best_auc)) {
    best_auc <- 0.5
  }
  
  # Add microscopic noise to prevent GP zero-variance
  best_auc <- best_auc + runif(1, 0, 1e-6)
  
  return(list(Score = best_auc))
}

set.seed(123)

opt_xgb <- bayesOpt(
  FUN = xgb_bayes,
  bounds = list(
    eta = c(0.01, 0.2),
    max_depth = c(3L, 10L),
    subsample = c(0.7, 1.0),
    colsample_bytree = c(0.7, 1.0),
    min_child_weight = c(1, 10)
  ),
  initPoints = 12,     
  iters.n = 20,
  acq = "ucb",
  kappa = 2.5,
  verbose = TRUE
)
## 
## Running initial scoring function 12 times in 1 thread(s)...
best_xgb_params <- getBestPars(opt_xgb)
cat("\nBest XGB Params:\n")
## 
## Best XGB Params:
print(best_xgb_params)
## $eta
## [1] 0.02104175
## 
## $max_depth
## [1] 5
## 
## $subsample
## [1] 0.7285954
## 
## $colsample_bytree
## [1] 0.7966621
## 
## $min_child_weight
## [1] 2.903258
11A.4.3 Train Final XGBoost and Evaluate on Test Set

The final tuned XGBoost model is trained and evaluated using the same classification metrics.

final_xgb_params <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  eta = best_xgb_params$eta,
  max_depth = as.integer(best_xgb_params$max_depth),
  subsample = best_xgb_params$subsample,
  colsample_bytree = best_xgb_params$colsample_bytree,
  min_child_weight = best_xgb_params$min_child_weight,
  scale_pos_weight = scale_pos_weight_val,
  tree_method = "hist",
  booster = "gbtree"
)

xgb_clf <- xgb.train(
  params = final_xgb_params,
  data = dtrain_xgb,
  nrounds = 700,
  watchlist = list(valid = dvalid_xgb),
  early_stopping_rounds = 30,
  verbose = 1
)
## [1]  valid-auc:0.786557 
## Will train until valid_auc hasn't improved in 30 rounds.
## 
## [2]  valid-auc:0.809775 
## [3]  valid-auc:0.813197 
## [4]  valid-auc:0.815307 
## [5]  valid-auc:0.816428 
## [6]  valid-auc:0.816590 
## [7]  valid-auc:0.815620 
## [8]  valid-auc:0.815315 
## [9]  valid-auc:0.816220 
## [10] valid-auc:0.818043 
## [11] valid-auc:0.818553 
## [12] valid-auc:0.819211 
## [13] valid-auc:0.819434 
## [14] valid-auc:0.819427 
## [15] valid-auc:0.819460 
## [16] valid-auc:0.819856 
## [17] valid-auc:0.820071 
## [18] valid-auc:0.820679 
## [19] valid-auc:0.820826 
## [20] valid-auc:0.820902 
## [21] valid-auc:0.821009 
## [22] valid-auc:0.821158 
## [23] valid-auc:0.821121 
## [24] valid-auc:0.821341 
## [25] valid-auc:0.821540 
## [26] valid-auc:0.821547 
## [27] valid-auc:0.821668 
## [28] valid-auc:0.821620 
## [29] valid-auc:0.821525 
## [30] valid-auc:0.821633 
## [31] valid-auc:0.821746 
## [32] valid-auc:0.821865 
## [33] valid-auc:0.822057 
## [34] valid-auc:0.822210 
## [35] valid-auc:0.822308 
## [36] valid-auc:0.822403 
## [37] valid-auc:0.822524 
## [38] valid-auc:0.822676 
## [39] valid-auc:0.822801 
## [40] valid-auc:0.822834 
## [41] valid-auc:0.822938 
## [42] valid-auc:0.822936 
## [43] valid-auc:0.823002 
## [44] valid-auc:0.823104 
## [45] valid-auc:0.823086 
## [46] valid-auc:0.823149 
## [47] valid-auc:0.823153 
## [48] valid-auc:0.823212 
## [49] valid-auc:0.823248 
## [50] valid-auc:0.823429 
## [51] valid-auc:0.823490 
## [52] valid-auc:0.823483 
## [53] valid-auc:0.823581 
## [54] valid-auc:0.823629 
## [55] valid-auc:0.823646 
## [56] valid-auc:0.823751 
## [57] valid-auc:0.823821 
## [58] valid-auc:0.823854 
## [59] valid-auc:0.823955 
## [60] valid-auc:0.824017 
## [61] valid-auc:0.824132 
## [62] valid-auc:0.824182 
## [63] valid-auc:0.824185 
## [64] valid-auc:0.824243 
## [65] valid-auc:0.824318 
## [66] valid-auc:0.824412 
## [67] valid-auc:0.824478 
## [68] valid-auc:0.824629 
## [69] valid-auc:0.824659 
## [70] valid-auc:0.824749 
## [71] valid-auc:0.824800 
## [72] valid-auc:0.824848 
## [73] valid-auc:0.824906 
## [74] valid-auc:0.824935 
## [75] valid-auc:0.825057 
## [76] valid-auc:0.825081 
## [77] valid-auc:0.825175 
## [78] valid-auc:0.825203 
## [79] valid-auc:0.825246 
## [80] valid-auc:0.825285 
## [81] valid-auc:0.825358 
## [82] valid-auc:0.825392 
## [83] valid-auc:0.825413 
## [84] valid-auc:0.825472 
## [85] valid-auc:0.825511 
## [86] valid-auc:0.825567 
## [87] valid-auc:0.825634 
## [88] valid-auc:0.825666 
## [89] valid-auc:0.825705 
## [90] valid-auc:0.825677 
## [91] valid-auc:0.825731 
## [92] valid-auc:0.825747 
## [93] valid-auc:0.825742 
## [94] valid-auc:0.825748 
## [95] valid-auc:0.825833 
## [96] valid-auc:0.825857 
## [97] valid-auc:0.825880 
## [98] valid-auc:0.825901 
## [99] valid-auc:0.825935 
## [100]    valid-auc:0.825966 
## [101]    valid-auc:0.825994 
## [102]    valid-auc:0.826045 
## [103]    valid-auc:0.826077 
## [104]    valid-auc:0.826099 
## [105]    valid-auc:0.826140 
## [106]    valid-auc:0.826157 
## [107]    valid-auc:0.826179 
## [108]    valid-auc:0.826188 
## [109]    valid-auc:0.826213 
## [110]    valid-auc:0.826198 
## [111]    valid-auc:0.826215 
## [112]    valid-auc:0.826247 
## [113]    valid-auc:0.826267 
## [114]    valid-auc:0.826294 
## [115]    valid-auc:0.826327 
## [116]    valid-auc:0.826336 
## [117]    valid-auc:0.826361 
## [118]    valid-auc:0.826399 
## [119]    valid-auc:0.826425 
## [120]    valid-auc:0.826458 
## [121]    valid-auc:0.826462 
## [122]    valid-auc:0.826475 
## [123]    valid-auc:0.826461 
## [124]    valid-auc:0.826473 
## [125]    valid-auc:0.826507 
## [126]    valid-auc:0.826519 
## [127]    valid-auc:0.826514 
## [128]    valid-auc:0.826515 
## [129]    valid-auc:0.826541 
## [130]    valid-auc:0.826578 
## [131]    valid-auc:0.826594 
## [132]    valid-auc:0.826597 
## [133]    valid-auc:0.826607 
## [134]    valid-auc:0.826631 
## [135]    valid-auc:0.826625 
## [136]    valid-auc:0.826636 
## [137]    valid-auc:0.826673 
## [138]    valid-auc:0.826677 
## [139]    valid-auc:0.826664 
## [140]    valid-auc:0.826684 
## [141]    valid-auc:0.826700 
## [142]    valid-auc:0.826723 
## [143]    valid-auc:0.826763 
## [144]    valid-auc:0.826764 
## [145]    valid-auc:0.826789 
## [146]    valid-auc:0.826815 
## [147]    valid-auc:0.826830 
## [148]    valid-auc:0.826824 
## [149]    valid-auc:0.826833 
## [150]    valid-auc:0.826851 
## [151]    valid-auc:0.826842 
## [152]    valid-auc:0.826859 
## [153]    valid-auc:0.826860 
## [154]    valid-auc:0.826876 
## [155]    valid-auc:0.826897 
## [156]    valid-auc:0.826916 
## [157]    valid-auc:0.826911 
## [158]    valid-auc:0.826930 
## [159]    valid-auc:0.826959 
## [160]    valid-auc:0.826973 
## [161]    valid-auc:0.826984 
## [162]    valid-auc:0.826983 
## [163]    valid-auc:0.826976 
## [164]    valid-auc:0.826973 
## [165]    valid-auc:0.826982 
## [166]    valid-auc:0.826996 
## [167]    valid-auc:0.827004 
## [168]    valid-auc:0.826991 
## [169]    valid-auc:0.827001 
## [170]    valid-auc:0.827011 
## [171]    valid-auc:0.827034 
## [172]    valid-auc:0.827034 
## [173]    valid-auc:0.827036 
## [174]    valid-auc:0.827045 
## [175]    valid-auc:0.827063 
## [176]    valid-auc:0.827071 
## [177]    valid-auc:0.827057 
## [178]    valid-auc:0.827061 
## [179]    valid-auc:0.827058 
## [180]    valid-auc:0.827076 
## [181]    valid-auc:0.827075 
## [182]    valid-auc:0.827092 
## [183]    valid-auc:0.827087 
## [184]    valid-auc:0.827099 
## [185]    valid-auc:0.827106 
## [186]    valid-auc:0.827113 
## [187]    valid-auc:0.827112 
## [188]    valid-auc:0.827128 
## [189]    valid-auc:0.827110 
## [190]    valid-auc:0.827107 
## [191]    valid-auc:0.827105 
## [192]    valid-auc:0.827105 
## [193]    valid-auc:0.827116 
## [194]    valid-auc:0.827117 
## [195]    valid-auc:0.827125 
## [196]    valid-auc:0.827135 
## [197]    valid-auc:0.827137 
## [198]    valid-auc:0.827157 
## [199]    valid-auc:0.827175 
## [200]    valid-auc:0.827183 
## [201]    valid-auc:0.827190 
## [202]    valid-auc:0.827192 
## [203]    valid-auc:0.827199 
## [204]    valid-auc:0.827203 
## [205]    valid-auc:0.827212 
## [206]    valid-auc:0.827219 
## [207]    valid-auc:0.827220 
## [208]    valid-auc:0.827216 
## [209]    valid-auc:0.827232 
## [210]    valid-auc:0.827245 
## [211]    valid-auc:0.827250 
## [212]    valid-auc:0.827255 
## [213]    valid-auc:0.827263 
## [214]    valid-auc:0.827259 
## [215]    valid-auc:0.827252 
## [216]    valid-auc:0.827249 
## [217]    valid-auc:0.827244 
## [218]    valid-auc:0.827250 
## [219]    valid-auc:0.827247 
## [220]    valid-auc:0.827256 
## [221]    valid-auc:0.827261 
## [222]    valid-auc:0.827275 
## [223]    valid-auc:0.827269 
## [224]    valid-auc:0.827264 
## [225]    valid-auc:0.827274 
## [226]    valid-auc:0.827282 
## [227]    valid-auc:0.827262 
## [228]    valid-auc:0.827265 
## [229]    valid-auc:0.827246 
## [230]    valid-auc:0.827246 
## [231]    valid-auc:0.827249 
## [232]    valid-auc:0.827246 
## [233]    valid-auc:0.827240 
## [234]    valid-auc:0.827233 
## [235]    valid-auc:0.827243 
## [236]    valid-auc:0.827264 
## [237]    valid-auc:0.827261 
## [238]    valid-auc:0.827250 
## [239]    valid-auc:0.827245 
## [240]    valid-auc:0.827253 
## [241]    valid-auc:0.827255 
## [242]    valid-auc:0.827265 
## [243]    valid-auc:0.827270 
## [244]    valid-auc:0.827262 
## [245]    valid-auc:0.827259 
## [246]    valid-auc:0.827262 
## [247]    valid-auc:0.827254 
## [248]    valid-auc:0.827244 
## [249]    valid-auc:0.827250 
## [250]    valid-auc:0.827249 
## [251]    valid-auc:0.827232 
## [252]    valid-auc:0.827229 
## [253]    valid-auc:0.827226 
## [254]    valid-auc:0.827225 
## [255]    valid-auc:0.827230 
## [256]    valid-auc:0.827237 
## Stopping. Best iteration:
## [226]    valid-auc:0.827282
pred_prob_xgb <- predict(xgb_clf, dtest_xgb)
pred_xgb <- ifelse(pred_prob_xgb > 0.5, 1, 0)

pred_xgb_factor <- factor(pred_xgb, levels = c(0,1))
test_factor_xgb <- factor(test_y_num, levels = c(0,1))

xgb_conf <- table(Predicted = pred_xgb_factor, Actual = test_factor_xgb)

xgb_acc  <- mean(pred_xgb == test_y_num)
# As above, caret's precision/recall/F_meas default to the first level ("0")
# as the relevant class; the reported values therefore describe the "No" class.
xgb_prec <- caret::precision(pred_xgb_factor, test_factor_xgb)
xgb_rec  <- caret::recall(pred_xgb_factor, test_factor_xgb)
xgb_f1   <- caret::F_meas(pred_xgb_factor, test_factor_xgb)
xgb_auc  <- auc(test_y_num, pred_prob_xgb)

cat("\nXGBoost Confusion Matrix:\n")
## 
## XGBoost Confusion Matrix:
print(xgb_conf)
##          Actual
## Predicted     0     1
##         0 28691   970
##         1 12232  3832
cat(sprintf(
  "\nXGBoost Accuracy: %.4f | Precision: %.4f | Recall: %.4f | F1: %.4f | AUC: %.4f\n",
  xgb_acc, xgb_prec, xgb_rec, xgb_f1, xgb_auc
))
## 
## XGBoost Accuracy: 0.7113 | Precision: 0.9673 | Recall: 0.7011 | F1: 0.8130 | AUC: 0.8246

11B. Regression Modelling

This section develops regression models to predict BMI as a continuous outcome. Two ensemble approaches are evaluated:

1. Random Forest Regression (Ranger) tuned using Bayesian Optimization.
2. XGBoost Regression tuned using Random Search with cross-validation.

Both models are evaluated on the test set using RMSE, MAE, MAPE, R², and Adjusted R².

library(ranger)
library(Metrics)
library(ParBayesianOptimization)
library(dplyr)
library(xgboost)

11B.1 Random Forest Regression (Ranger) with Bayesian Optimization

11B.1.1 Evaluation Metric Definition

Adjusted R² is included to account for the number of predictors used in the model.

adj_r2 <- function(y_true, y_pred, p) {
  # R-squared computed directly (1 - SSE/SST), so the helper does not
  # depend on a package-specific r2() function
  r2_val <- 1 - sum((y_true - y_pred)^2) / sum((y_true - mean(y_true))^2)
  n <- length(y_true)
  return(1 - (1 - r2_val) * ((n - 1) / (n - p - 1)))
}

p <- ncol(train_reg) - 1 
11B.1.2 Bayesian Optimization for Ranger (Validation RMSE)

Bayesian Optimization is used to tune mtry, min.node.size, and sample.fraction. The optimization objective is to minimize validation RMSE.

rf_bayes <- function(mtry, min_node_size, sample_fraction) {
  
  model <- ranger(
    BMI ~ .,
    data = train_reg,
    num.trees = 500,
    mtry = as.integer(mtry),
    min.node.size = as.integer(min_node_size),
    sample.fraction = sample_fraction,
    importance = "impurity"
  )
  
  pred_valid <- predict(model, data = valid_reg)$predictions
  rmse_valid <- Metrics::rmse(valid_reg$BMI, pred_valid)
  
  return(list(Score = -rmse_valid))  
}

cat("Running Bayesian Optimization for Ranger...\n\n")
## Running Bayesian Optimization for Ranger...
opt_results_rf <- bayesOpt(
  FUN = rf_bayes,
  bounds = list(
    mtry = c(2L, 15L),
    min_node_size = c(2L, 20L),
    sample_fraction = c(0.5, 1.0)
  ),
  initPoints = 8,
  iters.n = 20,
  acq = "ucb",
  kappa = 2.5,
  verbose = TRUE
)
## 
## Running initial scoring function 8 times in 1 thread(s)...
best_params_rf <- getBestPars(opt_results_rf)
cat("\nBest Ranger Parameters:\n")
## 
## Best Ranger Parameters:
print(best_params_rf)
## $mtry
## [1] 3
## 
## $min_node_size
## [1] 20
## 
## $sample_fraction
## [1] 0.5
11B.1.3 Train Final Ranger Model and Evaluate on Test Set

The final tuned Random Forest regression model is trained and evaluated using standard regression metrics.

rf_model_final <- ranger(
  BMI ~ .,
  data = train_reg,
  num.trees = 700,
  mtry = as.integer(best_params_rf$mtry),
  min.node.size = as.integer(best_params_rf$min_node_size),
  sample.fraction = best_params_rf$sample_fraction,
  importance = "impurity"
)

# =================
#  Predictions
# =================

pred_rf_train <- predict(rf_model_final, data = train_reg)$predictions
pred_rf_valid <- predict(rf_model_final, data = valid_reg)$predictions
pred_rf_test  <- predict(rf_model_final, data = test_reg)$predictions

# =================================
#  Final Evaluation (Test Set) 
# =================================

rf_rmse <- rmse(test_reg$BMI, pred_rf_test)
rf_mae  <- mae(test_reg$BMI, pred_rf_test)
# Metrics::mape returns a proportion, so the 0.16 reported below is ~16%
rf_mape <- mape(test_reg$BMI, pred_rf_test)

# caret::R2 expects predictions first, then observed values
rf_r2 <- caret::R2(pred_rf_test, test_reg$BMI)

# Adjusted R²
p <- ncol(test_reg) - 1
n <- nrow(test_reg)
rf_adj_r2 <- 1 - (1 - rf_r2) * ((n - 1) / (n - p - 1))

cat("\n========= RANDOM FOREST (Ranger) FINAL PERFORMANCE =========\n")
## 
## ========= RANDOM FOREST (Ranger) FINAL PERFORMANCE =========
cat(sprintf(
  "Test RMSE: %.4f | MAE: %.4f | MAPE: %.2f%% | R²: %.4f | Adj R²: %.4f\n",
  rf_rmse, rf_mae, rf_mape, rf_r2, rf_adj_r2
))
## Test RMSE: 6.3207 | MAE: 4.4754 | MAPE: 0.16% | R²: 0.1301 | Adj R²: 0.1298
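
The XGBoost regression counterpart referenced in the summary below is not reproduced in this excerpt. A minimal sketch of the random-search setup it describes (assuming the same train_reg frame; the parameter ranges and 20-draw budget are illustrative, not the original configuration) might look like:

# One-hot encode predictors; BMI remains the numeric target
X_reg <- model.matrix(BMI ~ ., train_reg)[, -1]
dreg  <- xgb.DMatrix(data = X_reg, label = train_reg$BMI)

set.seed(123)
best <- list(rmse = Inf, params = NULL)
for (i in 1:20) {                       # 20 random parameter draws
  params <- list(
    objective        = "reg:squarederror",
    eval_metric      = "rmse",
    eta              = runif(1, 0.01, 0.2),
    max_depth        = sample(3:10, 1),
    subsample        = runif(1, 0.7, 1.0),
    colsample_bytree = runif(1, 0.7, 1.0)
  )
  cv <- xgb.cv(params = params, data = dreg, nrounds = 300, nfold = 5,
               early_stopping_rounds = 20, verbose = 0)
  rmse_i <- min(cv$evaluation_log$test_rmse_mean)
  if (rmse_i < best$rmse) best <- list(rmse = rmse_i, params = params)
}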

Regression Summary

Overall, the regression task compares Random Forest regression and XGBoost regression for predicting BMI. Performance is evaluated using both error-based metrics (RMSE, MAE, MAPE) and goodness-of-fit metrics (R², Adjusted R²). With a test R² of roughly 0.13, the Random Forest explains only a modest share of BMI variation, suggesting that the available demographic, lifestyle, and chronic condition indicators capture a limited portion of what drives BMI in this dataset.