Health and Fitness Project

Anthony Morciglio

09/13/2025

Visualizing the Components of Health and Fitness as Related to Exercise

This R script performs a comprehensive statistical and machine learning analysis of a health and fitness dataset; the Health and Fitness dataset is available on Kaggle. It begins with descriptive statistics of key variables and then explores relationships between features. Specifically, it fits a linear regression predicting Calories_Burned from BMI, Session_Duration, Avg_BPM, Workout_Frequency, and Workout_Type. It also assesses how hydration and exercise style relate to performance, examining the association between Water_Intake and Avg_BPM and comparing Calories_Burned across Workout_Type groups. To uncover patterns in member behavior, it applies k-means clustering to Age, Avg_BPM, Session_Duration, Calories_Burned, and Workout_Frequency, grouping individuals into distinct activity-level categories. The script also visualizes the distributions of Calories_Burned and Session_Duration, highlighting measures of central tendency along with the 25th and 75th percentiles. Together, these analyses provide insight into how workout habits, body metrics, and exercise choices shape overall fitness outcomes.
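As a minimal sketch of the hydration check mentioned above (assuming the data have been loaded and the columns renamed as in the script below), the Water_Intake–Avg_BPM association can be tested with a single call:

# Sketch: correlation between hydration and average workout heart rate
# (run after the data-loading and renaming steps in the script below)
cor.test(df$Water_Intake, df$Avg_BPM, method = "pearson")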

# fitness_analysis.R
# Requirements: tidyverse, broom, scales, cluster, patchwork (ggplot2 and dplyr ship with tidyverse)
library(tidyverse)
library(broom)
library(scales)
library(cluster)
library(ggplot2)

# ---- 1. Load data ----
# Replace with your actual path/filename
data_file <- "C:/Users/Anthony Morciglio/OneDrive/Projects/Programming/R/gym_members_exercise_tracking.csv"
#data_file <- "gym_members_exercise_tracking.csv"
if(!file.exists(data_file)) stop(paste("Data file not found:", data_file))
df <- read_csv(data_file, show_col_types = FALSE)

# Quick look
glimpse(df)
## Rows: 973
## Columns: 15
## $ Age                             <dbl> 56, 46, 32, 25, 38, 56, 36, 40, 28, 28…
## $ Gender                          <chr> "Male", "Female", "Female", "Male", "M…
## $ `Weight (kg)`                   <dbl> 88.3, 74.9, 68.1, 53.2, 46.1, 58.0, 70…
## $ `Height (m)`                    <dbl> 1.71, 1.53, 1.66, 1.70, 1.79, 1.68, 1.…
## $ Max_BPM                         <dbl> 180, 179, 167, 190, 188, 168, 174, 189…
## $ Avg_BPM                         <dbl> 157, 151, 122, 164, 158, 156, 169, 141…
## $ Resting_BPM                     <dbl> 60, 66, 54, 56, 68, 74, 73, 64, 52, 64…
## $ `Session_Duration (hours)`      <dbl> 1.69, 1.30, 1.11, 0.59, 0.64, 1.59, 1.…
## $ Calories_Burned                 <dbl> 1313, 883, 677, 532, 556, 1116, 1385, …
## $ Workout_Type                    <chr> "Yoga", "HIIT", "Cardio", "Strength", …
## $ Fat_Percentage                  <dbl> 12.6, 33.9, 33.4, 28.8, 29.2, 15.5, 21…
## $ `Water_Intake (liters)`         <dbl> 3.5, 2.1, 2.3, 2.1, 2.8, 2.7, 2.3, 1.9…
## $ `Workout_Frequency (days/week)` <dbl> 4, 4, 4, 3, 3, 5, 3, 3, 4, 3, 2, 3, 3,…
## $ Experience_Level                <dbl> 3, 2, 2, 1, 1, 3, 2, 2, 2, 1, 1, 2, 2,…
## $ BMI                             <dbl> 30.20, 32.00, 24.71, 18.41, 14.39, 20.…
# Ensure correct types
df <- df %>%
  mutate(
    Gender = as.factor(Gender),
    Workout_Type = as.factor(Workout_Type),
    Experience_Level = as.factor(Experience_Level)
  )

colnames(df) <- c("Age", "Gender","Weight","Height","Max_BPM","Avg_BPM",
                  "Resting_BPM","Session_Duration","Calories_Burned",
                  "Workout_Type", "Fat_Percentage", "Water_Intake",
                  "Workout_Frequency", "Experience_Level", "BMI")

# ---- 2. Descriptive statistics ----
numeric_vars <- c("Age","Weight","Height","Max_BPM","Avg_BPM","Resting_BPM","Session_Duration","Calories_Burned","Fat_Percentage","Water_Intake","Workout_Frequency","BMI")
num_df <- df %>% select(all_of(numeric_vars))

desc_stats <- num_df %>%
  summarise_all(list(
    n = ~sum(!is.na(.)),
    mean = ~mean(., na.rm=TRUE),
    sd = ~sd(., na.rm=TRUE),
    median = ~median(., na.rm=TRUE),
    p25 = ~quantile(., probs=0.25, na.rm=TRUE),
    p75 = ~quantile(., probs=0.75, na.rm=TRUE),
    min = ~min(., na.rm=TRUE),
    max = ~max(., na.rm=TRUE)
  )) %>% pivot_longer(everything(), names_to = "stat_var", values_to = "value")

# Print selected results for Session_Duration and Calories_Burned
sess_stats <- df %>% summarise(
  n = sum(!is.na(Session_Duration)),
  mean = mean(Session_Duration, na.rm=TRUE),
  sd = sd(Session_Duration, na.rm=TRUE),
  median = median(Session_Duration, na.rm=TRUE),
  p25 = quantile(Session_Duration, probs=0.25, na.rm=TRUE),
  p75 = quantile(Session_Duration, probs=0.75, na.rm=TRUE),
  skewness = (mean((Session_Duration - mean(Session_Duration, na.rm=TRUE))^3, na.rm=TRUE) / sd(Session_Duration, na.rm=TRUE)^3)
)
cal_stats <- df %>% summarise(
  n = sum(!is.na(Calories_Burned)),
  mean = mean(Calories_Burned, na.rm=TRUE),
  sd = sd(Calories_Burned, na.rm=TRUE),
  median = median(Calories_Burned, na.rm=TRUE),
  p25 = quantile(Calories_Burned, probs=0.25, na.rm=TRUE),
  p75 = quantile(Calories_Burned, probs=0.75, na.rm=TRUE),
  skewness = (mean((Calories_Burned - mean(Calories_Burned, na.rm=TRUE))^3, na.rm=TRUE) / sd(Calories_Burned, na.rm=TRUE)^3)
)
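
# The skewness above is the simple moment-based estimate; for reuse it can be
# wrapped in a small helper (a sketch equivalent to the inline formula):
skewness <- function(x) {
  x <- x[!is.na(x)]
  mean((x - mean(x))^3) / sd(x)^3
}
# e.g. skewness(df$Calories_Burned)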

print("Session_Duration stats:")
## [1] "Session_Duration stats:"
print(sess_stats)
## # A tibble: 1 × 7
##       n  mean    sd median   p25   p75 skewness
##   <int> <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>
## 1   973  1.26 0.343   1.26  1.04  1.46   0.0257
print("Calories_Burned stats:")
## [1] "Calories_Burned stats:"
print(cal_stats)
## # A tibble: 1 × 7
##       n  mean    sd median   p25   p75 skewness
##   <int> <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>
## 1   973  905.  273.    893   720  1076    0.277
# ---- 3. Plot distributions ----
p1 <- ggplot(df, aes(x = Session_Duration)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 40,
    fill = "#1abc9c",   # teal bars
    color = "black",    # black border around the bars
    alpha = 0.8         # slight transparency so the density curve shows through
  ) +
  geom_density(linewidth = 1.2, color = "#2c3e50") +  # thicker, darker density line
  geom_vline(aes(xintercept = median(Session_Duration, na.rm = TRUE)),
             color = "#f39c12", linetype = "dashed", linewidth = 1) +  # median (orange)
  geom_vline(aes(xintercept = mean(Session_Duration, na.rm = TRUE)),
             color = "#e74c3c", linetype = "dashed", linewidth = 1) +  # mean (red)
  labs(
    title = "Distribution of Session Duration",
    subtitle = "Median (orange dashed) and mean (red dashed) lines shown",
    x = "Session Duration (hours)",
    y = "Density"
  ) +
  theme_minimal(base_size = 14)  # larger base font size for readability


p2 <- ggplot(df, aes(x = Calories_Burned)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 40,
    fill = "#1abc9c",   # teal bars
    color = "black",    # black border around the bars
    alpha = 0.8         # slight transparency so the density curve shows through
  ) +
  geom_density(linewidth = 1.2, color = "#2c3e50") +  # thicker, darker density line
  geom_vline(aes(xintercept = median(Calories_Burned, na.rm = TRUE)),
             color = "#f39c12", linetype = "dashed", linewidth = 1) +  # median (orange)
  geom_vline(aes(xintercept = mean(Calories_Burned, na.rm = TRUE)),
             color = "#e74c3c", linetype = "dashed", linewidth = 1) +  # mean (red)
  labs(
    title = "Distribution of Calories Burned",
    subtitle = "Median (orange dashed) and mean (red dashed) lines shown",
    x = "Calories Burned",
    y = "Density"
  ) +
  theme_minimal(base_size = 14)  # larger base font size for readability


ggsave("session_duration_density.png", plot=p1, width=8, height=4)
ggsave("calories_burned_density.png", plot=p2, width=8, height=4)

print(p1)

print(p2)

# ---- 4. Comparative analysis: Workout_Type & Experience_Level ----
# Example: Calories_Burned by Workout_Type (ANOVA if approx normal, else Kruskal-Wallis)
df %>%
  group_by(Workout_Type) %>%
  summarise(n = n(),
            mean_cal = mean(Calories_Burned, na.rm = TRUE),
            med_cal = median(Calories_Burned, na.rm = TRUE)) %>%
  arrange(desc(mean_cal)) %>%
  print(n = 50)
## # A tibble: 4 × 4
##   Workout_Type     n mean_cal med_cal
##   <fct>        <int>    <dbl>   <dbl>
## 1 HIIT           221     926.    920 
## 2 Strength       258     911.    904.
## 3 Yoga           239     903.    876 
## 4 Cardio         255     885.    888
kruskal_test <- kruskal.test(Calories_Burned ~ Workout_Type, data = df)
print(kruskal_test)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Calories_Burned by Workout_Type
## Kruskal-Wallis chi-squared = 1.4837, df = 3, p-value = 0.686
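# The comment above proposes ANOVA when the data are approximately normal; one
# possible check (a sketch) before defaulting to Kruskal-Wallis:
aov_cal <- aov(Calories_Burned ~ Workout_Type, data = df)
shapiro.test(residuals(aov_cal))  # a small p-value suggests departures from normality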
# By Experience_Level
df %>%
  group_by(Experience_Level) %>%
  summarise(n = n(),
            mean_cal = mean(Calories_Burned, na.rm = TRUE),
            med_cal = median(Calories_Burned, na.rm = TRUE)) %>%
  print()
## # A tibble: 3 × 4
##   Experience_Level     n mean_cal med_cal
##   <fct>            <int>    <dbl>   <dbl>
## 1 1                  376     726.    720.
## 2 2                  406     902.    886 
## 3 3                  191    1265.   1242
# ---- 5. Regression: predict Calories_Burned ----
# Prepare dataset: drop NA and convert categorical to factor
reg_df <- df %>% select(Calories_Burned, BMI, Session_Duration, Avg_BPM, Workout_Frequency, Workout_Type) %>%
  filter(!is.na(Calories_Burned) & !is.na(BMI) & !is.na(Session_Duration) & !is.na(Avg_BPM) & !is.na(Workout_Frequency))

# Fit baseline linear model
lm1 <- lm(Calories_Burned ~ BMI + Session_Duration + Avg_BPM + Workout_Frequency + Workout_Type, data = reg_df)
summary(lm1)
## 
## Call:
## lm(formula = Calories_Burned ~ BMI + Session_Duration + Avg_BPM + 
##     Workout_Frequency + Workout_Type, data = reg_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -200.045  -46.413   -0.184   46.021  252.821 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -936.4417    25.9792 -36.046  < 2e-16 ***
## BMI                     2.3876     0.3393   7.037 3.71e-12 ***
## Session_Duration      720.2914     8.6130  83.628  < 2e-16 ***
## Avg_BPM                 6.1524     0.1574  39.085  < 2e-16 ***
## Workout_Frequency      -1.1485     3.2329  -0.355    0.722    
## Workout_TypeHIIT       -4.1567     6.4782  -0.642    0.521    
## Workout_TypeStrength   -3.1593     6.2242  -0.508    0.612    
## Workout_TypeYoga       -6.1797     6.3456  -0.974    0.330    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.32 on 965 degrees of freedom
## Multiple R-squared:  0.934,  Adjusted R-squared:  0.9335 
## F-statistic:  1950 on 7 and 965 DF,  p-value: < 2.2e-16
# Check residual dispersion and optionally fit log model if skewed
lm1_diag <- augment(lm1)
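# lm1_diag (from broom::augment) is created above but not otherwise inspected;
# a quick residuals-vs-fitted plot (a sketch) to eyeball dispersion and curvature:
ggplot(lm1_diag, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "lm1: residuals vs fitted", x = "Fitted Calories_Burned", y = "Residual") +
  theme_minimal()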
# Optionally try log-transform
lm_log <- lm(log(Calories_Burned + 1) ~ BMI + Session_Duration + Avg_BPM + Workout_Frequency + Workout_Type, data = reg_df)
summary(lm_log)
## 
## Call:
## lm(formula = log(Calories_Burned + 1) ~ BMI + Session_Duration + 
##     Avg_BPM + Workout_Frequency + Workout_Type, data = reg_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29310 -0.06196  0.01398  0.05730  0.17572 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.6506975  0.0323118 143.932  < 2e-16 ***
## BMI                   0.0026851  0.0004220   6.363 3.05e-10 ***
## Session_Duration      0.8764606  0.0107125  81.816  < 2e-16 ***
## Avg_BPM               0.0068826  0.0001958  35.155  < 2e-16 ***
## Workout_Frequency    -0.0141935  0.0040210  -3.530 0.000435 ***
## Workout_TypeHIIT     -0.0009114  0.0080573  -0.113 0.909960    
## Workout_TypeStrength  0.0031545  0.0077414   0.407 0.683742    
## Workout_TypeYoga     -0.0047422  0.0078924  -0.601 0.548075    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08746 on 965 degrees of freedom
## Multiple R-squared:  0.9271, Adjusted R-squared:  0.9266 
## F-statistic:  1754 on 7 and 965 DF,  p-value: < 2.2e-16
# Save model summaries
capture.output(summary(lm1), file = "lm_calories_summary.txt")
capture.output(summary(lm_log), file = "lm_log_calories_summary.txt")

# ---- 6. Clustering: k-means on standardized features ----
clust_df <- df %>%
  select(Age, Avg_BPM, Session_Duration, Calories_Burned, Workout_Frequency) %>%
  drop_na()

# standardize features (z-scores) so each variable contributes equally
clust_scaled <- clust_df %>% mutate(across(everything(), ~ as.numeric(scale(.))))
set.seed(42)
# choose k=3 (adjustable)
k <- 3
km <- kmeans(clust_scaled, centers = k, nstart = 25)
clust_df$cluster <- as.factor(km$cluster)
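
# k = 3 is fixed above; if the choice needs justification, a quick elbow check
# (a sketch) over a small range of k:
wss <- map_dbl(1:8, ~ kmeans(clust_scaled, centers = .x, nstart = 25)$tot.withinss)
ggplot(tibble(k = 1:8, wss = wss), aes(x = k, y = wss)) +
  geom_line() + geom_point() +
  labs(title = "Elbow check for k-means", y = "Total within-cluster SS") +
  theme_minimal()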

# cluster centers on the standardized scale (as returned by kmeans)
centers_scaled <- km$centers %>% as.data.frame()
# map the centers back to original units by reversing the scaling, using each
# variable's mean and sd from the clustering data
centroids_orig <- as.data.frame(t(apply(km$centers, 1, function(row){
  vars <- names(clust_df %>% select(Age, Avg_BPM, Session_Duration, Calories_Burned, Workout_Frequency))
  sapply(seq_along(row), function(i){
    row[i] * sd(clust_df[[vars[i]]], na.rm=TRUE) + mean(clust_df[[vars[i]]], na.rm=TRUE)
  })
})))
names(centroids_orig) <- names(clust_df %>% select(Age, Avg_BPM, Session_Duration, Calories_Burned, Workout_Frequency))
centroids_orig$cluster <- 1:nrow(centroids_orig)
write_csv(centroids_orig, "cluster_centroids.csv")

# add clusters back to the original df
# (clust_df was built with drop_na(), so assign by complete cases rather than
#  assuming row numbers still line up)
complete_rows <- complete.cases(df %>% select(Age, Avg_BPM, Session_Duration, Calories_Burned, Workout_Frequency))
df_with_clusters <- df
df_with_clusters$cluster <- NA_integer_
df_with_clusters$cluster[complete_rows] <- km$cluster
df_with_clusters$cluster <- as.factor(df_with_clusters$cluster)

# save cluster sizes
table(df_with_clusters$cluster) %>% as.data.frame() %>% write_csv("cluster_sizes.csv")

# Visualize clusters

# clust_scaled is the standardized feature matrix and km the k-means fit from above.
# Project the observations onto their principal components for a 2-D view.
res_pca <- prcomp(clust_scaled, scale. = TRUE)

# Build a plotting data frame from the first two principal components;
# these become the x and y axes of the cluster plot.
plot_data <- data.frame(
  PC1 = res_pca$x[, 1],
  PC2 = res_pca$x[, 2],
  Cluster = factor(km$cluster) # Add your cluster assignments as a factor
)

# Create the cluster scatter plot in PC space, save it, and print it
p_clusters <- ggplot(plot_data, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point() +                                       # one point per member
  stat_ellipse(aes(group = Cluster), type = "norm") +  # normal-contour ellipse per cluster
  labs(title = "K-means clusters (scaled features)") +
  theme_minimal()

ggsave("kmeans_clusters.png", plot = p_clusters, width = 8, height = 5)
print(p_clusters)
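
# Optional sketch: report how much variance the two plotted components capture
var_explained <- res_pca$sdev^2 / sum(res_pca$sdev^2)
cat("PC1:", percent(var_explained[1]), "of variance; PC2:", percent(var_explained[2]), "\n")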

# ---- 7. Percentiles needed to move between clusters ----
# For each cluster, compute median and percentiles for the key metrics
cluster_summary <- clust_df %>% 
  mutate(cluster = as.factor(km$cluster)) %>%
  group_by(cluster) %>%
  summarise(
    n = n(),
    med_Cal = median(Calories_Burned, na.rm=TRUE),
    p25_Cal = quantile(Calories_Burned, probs=0.25, na.rm=TRUE),
    p75_Cal = quantile(Calories_Burned, probs=0.75, na.rm=TRUE),
    med_Sess = median(Session_Duration, na.rm=TRUE),
    p25_Sess = quantile(Session_Duration, probs=0.25, na.rm=TRUE),
    p75_Sess = quantile(Session_Duration, probs=0.75, na.rm=TRUE)
  )
write_csv(cluster_summary, "cluster_summary.csv")
print(cluster_summary)
## # A tibble: 3 × 8
##   cluster     n med_Cal p25_Cal p75_Cal med_Sess p25_Sess p75_Sess
##   <fct>   <int>   <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>
## 1 1         356     947    848    1050.     1.26     1.1      1.36
## 2 2         235    1198   1070.   1356.     1.71     1.54     1.85
## 3 3         382     692    553.    800.     1.04     0.81     1.21
# Determine thresholds: for a person in a lower-activity cluster, what percentile of Calories_Burned would they need to reach to fall into a higher-activity cluster?
# Simple approach: For each variable, compute the 90th percentile of lower cluster and 10th percentile of upper cluster to suggest overlap zone.
cluster_bounds <- clust_df %>% mutate(cluster = as.factor(km$cluster)) %>%
  group_by(cluster) %>%
  summarise(p10_Cal = quantile(Calories_Burned, probs=0.10, na.rm=TRUE),
            p90_Cal = quantile(Calories_Burned, probs=0.90, na.rm=TRUE),
            p10_Sess = quantile(Session_Duration, probs=0.10, na.rm=TRUE),
            p90_Sess = quantile(Session_Duration, probs=0.90, na.rm=TRUE))
write_csv(cluster_bounds, "cluster_bounds.csv")
print(cluster_bounds)
## # A tibble: 3 × 5
##   cluster p10_Cal p90_Cal p10_Sess p90_Sess
##   <fct>     <dbl>   <dbl>    <dbl>    <dbl>
## 1 1          753     1148     1.01     1.43
## 2 2          957.    1487     1.42     1.94
## 3 3          440      876     0.63     1.37
# Example: compute what percentile a given person would need for Calories_Burned to exceed the median of a higher cluster
# From cluster_summary above, cluster 2 is the highest-activity group (highest median calories and session duration)
# Compute percentiles over the whole clustering dataset
all_cal_percentiles <- ecdf(clust_df$Calories_Burned)
all_sess_percentiles <- ecdf(clust_df$Session_Duration)
# Example: value needed to reach 75th percentile:
cal_75_value <- quantile(clust_df$Calories_Burned, probs=0.75, na.rm=TRUE)
sess_75_value <- quantile(clust_df$Session_Duration, probs=0.75, na.rm=TRUE)
cat("Example thresholds (75th percentiles): Calories_Burned:", cal_75_value, " Session_Duration:", sess_75_value, "\n")
## Example thresholds (75th percentiles): Calories_Burned: 1076  Session_Duration: 1.46
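# Usage sketch for the ecdf objects above (the 1000 kcal and 1.5 h inputs are
# arbitrary illustrative values):
cat("1000 kcal sits at the", round(100 * all_cal_percentiles(1000)), "th percentile of Calories_Burned\n")
cat("1.5 h sits at the", round(100 * all_sess_percentiles(1.5)), "th percentile of Session_Duration\n")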
# ---- 8. Additional regression: physiological predictors of Calories_Burned ----

# Fit a multiple linear regression on the full dataset
# (the tilde separates the dependent variable from the predictors)
model <- lm(Calories_Burned ~ Session_Duration + Resting_BPM + Max_BPM + Fat_Percentage + Age, data = df)

# Print a summary of the model results
summary(model)
## 
## Call:
## lm(formula = Calories_Burned ~ Session_Duration + Resting_BPM + 
##     Max_BPM + Fat_Percentage + Age, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -300.65  -71.30   -5.32   73.25  338.18 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      273.6581    67.0472   4.082 4.84e-05 ***
## Session_Duration 671.1796    12.0989  55.474  < 2e-16 ***
## Resting_BPM        1.1086     0.4611   2.404   0.0164 *  
## Max_BPM           -0.2555     0.2930  -0.872   0.3833    
## Fat_Percentage    -4.6088     0.6629  -6.952 6.61e-12 ***
## Age               -3.0872     0.2771 -11.142  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 105.2 on 967 degrees of freedom
## Multiple R-squared:  0.8519, Adjusted R-squared:  0.8512 
## F-statistic:  1113 on 5 and 967 DF,  p-value: < 2.2e-16
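# broom is already loaded; a tidy coefficient table with 95% confidence intervals
# for this model (a sketch):
tidy(model, conf.int = TRUE) %>% arrange(p.value)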
library(ggplot2)
library(patchwork) # For combining plots

# Scatter plot for Session Duration
p_duration <- ggplot(df, aes(x = Session_Duration, y = Calories_Burned)) +
  geom_point(alpha = 0.5, color = "dodgerblue") +
  geom_smooth(method = "lm", se = FALSE, color = "firebrick", size = 1.2) +
  labs(title = "Calories vs. Session Duration",
       x = "Session Duration", y = "Calories Burned") +
  theme_minimal()

# Scatter plot for Max BPM
p_maxbpm <- ggplot(df, aes(x = Max_BPM, y = Calories_Burned)) +
  geom_point(alpha = 0.5, color = "forestgreen") +
  geom_smooth(method = "lm", se = FALSE, color = "orange", size = 1.2) +
  labs(title = "Calories vs. Max BPM",
       x = "Max Beats Per Minute", y = "Calories Burned") +
  theme_minimal()

# Scatter plot for Fat Percentage
p_fat <- ggplot(df, aes(x = Fat_Percentage, y = Calories_Burned)) +
  geom_point(alpha = 0.5, color = "purple") +
  geom_smooth(method = "lm", se = FALSE, color = "black", size = 1.2) +
  labs(title = "Calories vs. Fat Percentage",
       x = "Fat Percentage", y = "Calories Burned") +
  theme_minimal()

# Combine the plots into a single view
p_duration + p_maxbpm + p_fat + plot_layout(ncol = 3)
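
# Optional sketch: persist the three panels alongside the other figures
# (the filename is illustrative)
ggsave("calories_scatter_panels.png",
       plot = p_duration + p_maxbpm + p_fat + plot_layout(ncol = 3),
       width = 12, height = 4)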

# ---- 9. Supplementary Information ----
# Load necessary libraries for data manipulation and visualization
# If you don't have these installed, run install.packages(c("ggplot2", "patchwork", "dplyr"))
library(ggplot2)
library(patchwork)
library(dplyr)

# --- SECTION 1: Creating a Simulated Dataset for Demonstration ---
# This block builds a small simulated dataset (sim_df) so the visualizations below
# can run standalone; it is kept separate from the gym data loaded earlier.
# The variable names mirror the gym dataset's fields.
set.seed(42) # For reproducibility
num_records <- 200

sim_df <- data.frame(
  Workout_Type = as.factor(sample(c("Cardio", "Strength", "HIIT", "Yoga"), num_records, replace = TRUE)),
  Calories_Burned = rnorm(num_records, mean = 250, sd = 70),
  Water_Intake = rnorm(num_records, mean = 2.5, sd = 0.8),
  Resting_BPM = rnorm(num_records, mean = 65, sd = 8),
  BMI = rnorm(num_records, mean = 24, sd = 3),
  Avg_BPM = rnorm(num_records, mean = 110, sd = 15)  # simulated average workout heart rate
)

# Adjust values to be more realistic for each workout type and other factors
sim_df <- sim_df %>%
  mutate(
    Calories_Burned = case_when(
      Workout_Type == "HIIT" ~ Calories_Burned + 100,
      Workout_Type == "Strength" ~ Calories_Burned + 50,
      TRUE ~ Calories_Burned
    ),
    # Adjust Avg_BPM based on Workout_Type
    Avg_BPM = case_when(
      Workout_Type == "HIIT" ~ Avg_BPM + 20,
      Workout_Type == "Cardio" ~ Avg_BPM + 10,
      Workout_Type == "Yoga" ~ Avg_BPM - 30,
      TRUE ~ Avg_BPM
    ),
    Resting_BPM = case_when(
      Water_Intake > 3 ~ Resting_BPM - 5,
      Water_Intake < 2 ~ Resting_BPM + 5,
      TRUE ~ Resting_BPM
    )
  )

# --- SECTION 2: Analysis of Workout_Type on Calories_Burned and Avg_BPM ---

# Description:
# Since Workout_Type is a categorical variable, a great way to visualize its
# effect on numerical variables is with box plots or violin plots. These plots
# show the distribution of data for each category, allowing you to easily
# compare the median, spread, and outliers.

# Visualization 1: Calories Burned by Workout Type
plot_calories <- ggplot(sim_df, aes(x = Workout_Type, y = Calories_Burned, fill = Workout_Type)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.2, alpha = 0.7, fill = "white") +
  labs(
    title = "Calories Burned by Workout Type",
    subtitle = "Higher median calories burned for HIIT and Strength workouts",
    x = "Workout Type",
    y = "Calories Burned (kcal)"
  ) +
  theme_minimal() +
  scale_fill_viridis_d() +
  theme(legend.position = "none")

# Visualization 2: Average BPM by Workout Type
plot_avg_bpm <- ggplot(sim_df, aes(x = Workout_Type, y = Avg_BPM, fill = Workout_Type)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.2, alpha = 0.7, fill = "white") +
  labs(
    title = "Average BPM by Workout Type",
    subtitle = "HIIT and Cardio workouts show a higher average BPM",
    x = "Workout Type",
    y = "Average BPM"
  ) +
  theme_minimal() +
  scale_fill_viridis_d() +
  theme(legend.position = "none")

# Combine the two plots for a side-by-side comparison
combined_workout_plots <- plot_calories + plot_avg_bpm

# Print the combined plot
combined_workout_plots

# Statistical Analysis (ANOVA):
# To formally test if the average Calories_Burned or Avg_BPM is
# statistically different across workout types, we can use a one-way ANOVA.
cat("\n--- ANOVA Results for Calories Burned by Workout Type ---\n")
## 
## --- ANOVA Results for Calories Burned by Workout Type ---
anova_calories <- aov(Calories_Burned ~ Workout_Type, data = sim_df)
print(summary(anova_calories))
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## Workout_Type   3 171769   57256    12.7 1.29e-07 ***
## Residuals    196 884019    4510                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- ANOVA Results for Average BPM by Workout Type ---\n")
## 
## --- ANOVA Results for Average BPM by Workout Type ---
anova_avg_bpm <- aov(Avg_BPM ~ Workout_Type, data = sim_df)
print(summary(anova_avg_bpm))
##               Df Sum Sq Mean Sq F value Pr(>F)    
## Workout_Type   3  51491   17164    63.3 <2e-16 ***
## Residuals    196  53148     271                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
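# Both ANOVAs are significant on the simulated data; a post-hoc pairwise comparison
# (a sketch using Tukey's HSD) would show which workout types differ:
TukeyHSD(anova_calories)
TukeyHSD(anova_avg_bpm)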
# --- SECTION 3: Analysis of Water_Intake on Resting_BPM, BMI, and Avg_BPM ---

# Description:
# For visualizing the relationship between two continuous variables, a scatter plot
# with a linear regression line is a perfect tool. The line shows the overall trend,
# and the shaded area is the confidence interval for that trend.

# Visualization 1: Water Intake vs. Resting BPM
plot_bpm <- ggplot(sim_df, aes(x = Water_Intake, y = Resting_BPM)) +
  geom_point(alpha = 0.6, color = "darkblue") +
  geom_smooth(method = "lm", se = TRUE, color = "skyblue") +
  labs(
    title = "Water Intake vs. Resting BPM",
    x = "Daily Water Intake (L)",
    y = "Resting BPM"
  ) +
  theme_minimal()

# Visualization 2: Water Intake vs. BMI
plot_bmi <- ggplot(sim_df, aes(x = Water_Intake, y = BMI)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_smooth(method = "lm", se = TRUE, color = "lightgreen") +
  labs(
    title = "Water Intake vs. BMI",
    x = "Daily Water Intake (L)",
    y = "BMI"
  ) +
  theme_minimal()

# Visualization 3: Water Intake vs. Average BPM
plot_avg_bpm_water <- ggplot(sim_df, aes(x = Water_Intake, y = Avg_BPM)) +
  geom_point(alpha = 0.6, color = "darkred") +
  geom_smooth(method = "lm", se = TRUE, color = "orange") +
  labs(
    title = "Water Intake vs. Average BPM",
    x = "Daily Water Intake (L)",
    y = "Average BPM"
  ) +
  theme_minimal()

# Combine the three plots into a single layout
combined_water_plots <- plot_bpm + plot_bmi + plot_avg_bpm_water

# Print the combined plot
combined_water_plots
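
# Sketch: Pearson correlations corresponding to the three trend lines above
sim_df %>%
  summarise(
    cor_resting_bpm = cor(Water_Intake, Resting_BPM),
    cor_bmi         = cor(Water_Intake, BMI),
    cor_avg_bpm     = cor(Water_Intake, Avg_BPM)
  )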

# ---- 10. End message ----
cat("Analysis complete. Plots saved: session_duration_density.png, calories_burned_density.png, kmeans_clusters.png. \n")
## Analysis complete. Plots saved: session_duration_density.png, calories_burned_density.png, kmeans_clusters.png.
cat("CSV outputs saved: cluster_centroids.csv, cluster_summary.csv, cluster_bounds.csv, lm_calories_summary.txt, lm_log_calories_summary.txt\n")
## CSV outputs saved: cluster_centroids.csv, cluster_summary.csv, cluster_bounds.csv, lm_calories_summary.txt, lm_log_calories_summary.txt