This R script performs a comprehensive statistical and machine learning analysis of a health and fitness dataset. The Health and Fitness dataset can be accessed at the following link: Kaggle. The script begins with descriptive statistics of the key variables and then explores relationships between features. Specifically, it fits a regression model predicting Calories_Burned from BMI, Session_Duration, Avg_BPM, Workout_Frequency, and Workout_Type. It also examines how hydration and exercise style relate to performance, including the relationship between Water_Intake and Avg_BPM and between Workout_Type and Calories_Burned. To uncover patterns in member behavior, it applies K-means clustering to variables such as Age, Avg_BPM, and Calories_Burned, grouping individuals into distinct activity-level categories. Additionally, the script visualizes the distributions of Calories_Burned and Session_Duration, highlighting measures of central tendency along with the 25th and 75th percentiles. Together, these analyses provide insight into how workout habits, body metrics, and exercise choices shape overall fitness outcomes.
# fitness_analysis.R
# Requirements: tidyverse, broom, scales, cluster, factoextra, ggplot2, patchwork
library(tidyverse)
library(broom)
library(scales)
library(cluster)
library(ggplot2)
# ---- 1. Load data ----
# Replace with your actual path/filename
data_file <- "C:/Users/Anthony Morciglio/OneDrive/Projects/Programming/R/gym_members_exercise_tracking.csv"
#data_file <- "gym_members_exercise_tracking.csv"
if(!file.exists(data_file)) stop(paste("Data file not found:", data_file))
df <- read_csv(data_file, show_col_types = FALSE)
# Quick look
glimpse(df)
## Rows: 973
## Columns: 15
## $ Age <dbl> 56, 46, 32, 25, 38, 56, 36, 40, 28, 28…
## $ Gender <chr> "Male", "Female", "Female", "Male", "M…
## $ `Weight (kg)` <dbl> 88.3, 74.9, 68.1, 53.2, 46.1, 58.0, 70…
## $ `Height (m)` <dbl> 1.71, 1.53, 1.66, 1.70, 1.79, 1.68, 1.…
## $ Max_BPM <dbl> 180, 179, 167, 190, 188, 168, 174, 189…
## $ Avg_BPM <dbl> 157, 151, 122, 164, 158, 156, 169, 141…
## $ Resting_BPM <dbl> 60, 66, 54, 56, 68, 74, 73, 64, 52, 64…
## $ `Session_Duration (hours)` <dbl> 1.69, 1.30, 1.11, 0.59, 0.64, 1.59, 1.…
## $ Calories_Burned <dbl> 1313, 883, 677, 532, 556, 1116, 1385, …
## $ Workout_Type <chr> "Yoga", "HIIT", "Cardio", "Strength", …
## $ Fat_Percentage <dbl> 12.6, 33.9, 33.4, 28.8, 29.2, 15.5, 21…
## $ `Water_Intake (liters)` <dbl> 3.5, 2.1, 2.3, 2.1, 2.8, 2.7, 2.3, 1.9…
## $ `Workout_Frequency (days/week)` <dbl> 4, 4, 4, 3, 3, 5, 3, 3, 4, 3, 2, 3, 3,…
## $ Experience_Level <dbl> 3, 2, 2, 1, 1, 3, 2, 2, 2, 1, 1, 2, 2,…
## $ BMI <dbl> 30.20, 32.00, 24.71, 18.41, 14.39, 20.…
# Ensure correct types
df <- df %>%
mutate(
Gender = as.factor(Gender),
Workout_Type = as.factor(Workout_Type),
Experience_Level = as.factor(Experience_Level)
)
colnames(df) <- c("Age", "Gender","Weight","Height","Max_BPM","Avg_BPM",
"Resting_BPM","Session_Duration","Calories_Burned",
"Workout_Type", "Fat_Percentage", "Water_Intake",
"Workout_Frequency", "Experience_Level", "BMI")
# ---- 2. Descriptive statistics ----
numeric_vars <- c("Age","Weight","Height","Max_BPM","Avg_BPM","Resting_BPM","Session_Duration","Calories_Burned","Fat_Percentage","Water_Intake","Workout_Frequency","BMI")
num_df <- df %>% select(all_of(numeric_vars))
desc_stats <- num_df %>%
  summarise(across(everything(), list(
    n = ~sum(!is.na(.)),
    mean = ~mean(., na.rm = TRUE),
    sd = ~sd(., na.rm = TRUE),
    median = ~median(., na.rm = TRUE),
    p25 = ~quantile(., probs = 0.25, na.rm = TRUE),
    p75 = ~quantile(., probs = 0.75, na.rm = TRUE),
    min = ~min(., na.rm = TRUE),
    max = ~max(., na.rm = TRUE)
  ))) %>%
  pivot_longer(everything(), names_to = "stat_var", values_to = "value")
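# (optional) the long-format table can be filtered to inspect a single variable, e.g. BMI:
# desc_stats %>% filter(str_detect(stat_var, "^BMI_"))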
# Print selected results for Session_Duration and Calories_Burned
sess_stats <- df %>% summarise(
n = sum(!is.na(Session_Duration)),
mean = mean(Session_Duration, na.rm=TRUE),
sd = sd(Session_Duration, na.rm=TRUE),
median = median(Session_Duration, na.rm=TRUE),
p25 = quantile(Session_Duration, probs=0.25, na.rm=TRUE),
p75 = quantile(Session_Duration, probs=0.75, na.rm=TRUE),
skewness = (mean((Session_Duration - mean(Session_Duration, na.rm=TRUE))^3, na.rm=TRUE) / sd(Session_Duration, na.rm=TRUE)^3)
)
cal_stats <- df %>% summarise(
n = sum(!is.na(Calories_Burned)),
mean = mean(Calories_Burned, na.rm=TRUE),
sd = sd(Calories_Burned, na.rm=TRUE),
median = median(Calories_Burned, na.rm=TRUE),
p25 = quantile(Calories_Burned, probs=0.25, na.rm=TRUE),
p75 = quantile(Calories_Burned, probs=0.75, na.rm=TRUE),
skewness = (mean((Calories_Burned - mean(Calories_Burned, na.rm=TRUE))^3, na.rm=TRUE) / sd(Calories_Burned, na.rm=TRUE)^3)
)
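# The third-moment skewness formula above is repeated for each variable; an equivalent helper
# (optional, not used for the printed results below) would avoid the duplication:
skew_fn <- function(x) mean((x - mean(x, na.rm = TRUE))^3, na.rm = TRUE) / sd(x, na.rm = TRUE)^3
# e.g. skew_fn(df$Session_Duration) reproduces the skewness reported below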
print("Session_Duration stats:")
## [1] "Session_Duration stats:"
print(sess_stats)
## # A tibble: 1 × 7
## n mean sd median p25 p75 skewness
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 973 1.26 0.343 1.26 1.04 1.46 0.0257
print("Calories_Burned stats:")
## [1] "Calories_Burned stats:"
print(cal_stats)
## # A tibble: 1 × 7
## n mean sd median p25 p75 skewness
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 973 905. 273. 893 720 1076 0.277
# ---- 3. Plot distributions ----
p1 <- ggplot(df, aes(x = Session_Duration)) +
  geom_histogram(
    aes(y = after_stat(density)),  # density scale so the curve overlays the bars
    bins = 40,
    fill = "#1abc9c",
    color = "black",
    alpha = 0.8
  ) +
  geom_density(size = 1.2, color = "#2c3e50") +
  geom_vline(aes(xintercept = median(Session_Duration, na.rm = TRUE)),
             color = "#f39c12", linetype = "dashed", size = 1) +  # median (orange)
  geom_vline(aes(xintercept = mean(Session_Duration, na.rm = TRUE)),
             color = "#e74c3c", linetype = "dashed", size = 1) +  # mean (red)
  labs(
    title = "Distribution of Session Duration",
    subtitle = "Median (orange) and mean (red) are shown",
    x = "Session Duration (hours)",
    y = "Density"
  ) +
  theme_minimal(base_size = 14)
p2 <- ggplot(df, aes(x = Calories_Burned)) +
  geom_histogram(
    aes(y = after_stat(density)),  # density scale so the curve overlays the bars
    bins = 40,
    fill = "#1abc9c",
    color = "black",
    alpha = 0.8
  ) +
  geom_density(size = 1.2, color = "#2c3e50") +
  geom_vline(aes(xintercept = median(Calories_Burned, na.rm = TRUE)),
             color = "#f39c12", linetype = "dashed", size = 1) +  # median (orange)
  geom_vline(aes(xintercept = mean(Calories_Burned, na.rm = TRUE)),
             color = "#e74c3c", linetype = "dashed", size = 1) +  # mean (red)
  labs(
    title = "Distribution of Calories Burned",
    subtitle = "Median (orange) and mean (red) are shown",
    x = "Calories Burned",
    y = "Density"
  ) +
  theme_minimal(base_size = 14)
ggsave("session_duration_density.png", plot=p1, width=8, height=4)
ggsave("calories_burned_density.png", plot=p2, width=8, height=4)
print(p1)
print(p2)
# ---- 4. Comparative analysis: Workout_Type & Experience_Level ----
# Example: Calories_Burned by Workout_Type (ANOVA if approx normal, else Kruskal-Wallis)
df %>%
  group_by(Workout_Type) %>%
  summarise(n = n(),
            mean_cal = mean(Calories_Burned, na.rm = TRUE),
            med_cal = median(Calories_Burned, na.rm = TRUE)) %>%
  arrange(desc(mean_cal)) %>%
  print(n = 50)
## # A tibble: 4 × 4
## Workout_Type n mean_cal med_cal
## <fct> <int> <dbl> <dbl>
## 1 HIIT 221 926. 920
## 2 Strength 258 911. 904.
## 3 Yoga 239 903. 876
## 4 Cardio 255 885. 888
kruskal_test <- kruskal.test(Calories_Burned ~ Workout_Type, data = df)
print(kruskal_test)
##
## Kruskal-Wallis rank sum test
##
## data: Calories_Burned by Workout_Type
## Kruskal-Wallis chi-squared = 1.4837, df = 3, p-value = 0.686
# By Experience_Level
df %>%
  group_by(Experience_Level) %>%
  summarise(n = n(),
            mean_cal = mean(Calories_Burned, na.rm = TRUE),
            med_cal = median(Calories_Burned, na.rm = TRUE)) %>%
  print()
## # A tibble: 3 × 4
## Experience_Level n mean_cal med_cal
## <fct> <int> <dbl> <dbl>
## 1 1 376 726. 720.
## 2 2 406 902. 886
## 3 3 191 1265. 1242
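# As a parallel check (not shown in the original output), the same non-parametric test can be run
# across experience levels; given the monotone group means above, a significant difference is expected.
kruskal_exp <- kruskal.test(Calories_Burned ~ Experience_Level, data = df)
# print(kruskal_exp)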
# ---- 5. Regression: predict Calories_Burned ----
# Prepare dataset: drop NA and convert categorical to factor
reg_df <- df %>% select(Calories_Burned, BMI, Session_Duration, Avg_BPM, Workout_Frequency, Workout_Type) %>%
filter(!is.na(Calories_Burned) & !is.na(BMI) & !is.na(Session_Duration) & !is.na(Avg_BPM) & !is.na(Workout_Frequency))
# Fit baseline linear model
lm1 <- lm(Calories_Burned ~ BMI + Session_Duration + Avg_BPM + Workout_Frequency + Workout_Type, data = reg_df)
summary(lm1)
##
## Call:
## lm(formula = Calories_Burned ~ BMI + Session_Duration + Avg_BPM +
## Workout_Frequency + Workout_Type, data = reg_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -200.045 -46.413 -0.184 46.021 252.821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -936.4417 25.9792 -36.046 < 2e-16 ***
## BMI 2.3876 0.3393 7.037 3.71e-12 ***
## Session_Duration 720.2914 8.6130 83.628 < 2e-16 ***
## Avg_BPM 6.1524 0.1574 39.085 < 2e-16 ***
## Workout_Frequency -1.1485 3.2329 -0.355 0.722
## Workout_TypeHIIT -4.1567 6.4782 -0.642 0.521
## Workout_TypeStrength -3.1593 6.2242 -0.508 0.612
## Workout_TypeYoga -6.1797 6.3456 -0.974 0.330
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.32 on 965 degrees of freedom
## Multiple R-squared: 0.934, Adjusted R-squared: 0.9335
## F-statistic: 1950 on 7 and 965 DF, p-value: < 2.2e-16
# Check residual dispersion and optionally fit log model if skewed
lm1_diag <- augment(lm1)
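# A residuals-vs-fitted plot (optional sketch using the augment() output above) makes the dispersion check concrete:
p_resid <- ggplot(lm1_diag, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "firebrick") +
  labs(title = "Residuals vs fitted values (lm1)", x = "Fitted Calories_Burned", y = "Residual") +
  theme_minimal()
print(p_resid)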
# Optionally try log-transform
lm_log <- lm(log(Calories_Burned + 1) ~ BMI + Session_Duration + Avg_BPM + Workout_Frequency + Workout_Type, data = reg_df)
summary(lm_log)
##
## Call:
## lm(formula = log(Calories_Burned + 1) ~ BMI + Session_Duration +
## Avg_BPM + Workout_Frequency + Workout_Type, data = reg_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29310 -0.06196 0.01398 0.05730 0.17572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.6506975 0.0323118 143.932 < 2e-16 ***
## BMI 0.0026851 0.0004220 6.363 3.05e-10 ***
## Session_Duration 0.8764606 0.0107125 81.816 < 2e-16 ***
## Avg_BPM 0.0068826 0.0001958 35.155 < 2e-16 ***
## Workout_Frequency -0.0141935 0.0040210 -3.530 0.000435 ***
## Workout_TypeHIIT -0.0009114 0.0080573 -0.113 0.909960
## Workout_TypeStrength 0.0031545 0.0077414 0.407 0.683742
## Workout_TypeYoga -0.0047422 0.0078924 -0.601 0.548075
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08746 on 965 degrees of freedom
## Multiple R-squared: 0.9271, Adjusted R-squared: 0.9266
## F-statistic: 1754 on 7 and 965 DF, p-value: < 2.2e-16
# Save model summaries
capture.output(summary(lm1), file = "lm_calories_summary.txt")
capture.output(summary(lm_log), file = "lm_log_calories_summary.txt")
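# (optional) a rough way to compare the two specifications on the original calorie scale is to
# back-transform the log-model fits and compute in-sample RMSE (ignoring retransformation bias):
rmse_lm1 <- sqrt(mean((reg_df$Calories_Burned - fitted(lm1))^2))
rmse_log <- sqrt(mean((reg_df$Calories_Burned - (exp(fitted(lm_log)) - 1))^2))
# c(rmse_lm1 = rmse_lm1, rmse_log = rmse_log)  # not printed in the original run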
# ---- 6. Clustering: k-means on standardized features ----
clust_df <- df %>%
select(Age, Avg_BPM, Session_Duration, Calories_Burned, Workout_Frequency) %>%
drop_na()
# scale features
clust_scaled <- clust_df %>% mutate(across(everything(), ~ as.numeric(scale(.))))
set.seed(42)
# choose k=3 (adjustable)
k <- 3
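# (optional) an elbow plot over candidate k values can support this choice; kept commented out so the
# RNG state for the main fit below is unchanged (a minimal sketch):
# wss <- sapply(1:8, function(kk) kmeans(clust_scaled, centers = kk, nstart = 25)$tot.withinss)
# plot(1:8, wss, type = "b", xlab = "Number of clusters k", ylab = "Total within-cluster sum of squares")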
km <- kmeans(clust_scaled, centers = k, nstart = 25)
clust_df$cluster <- as.factor(km$cluster)
# cluster centers are returned in scaled (z-score) space
centers_scaled <- km$centers %>% as.data.frame()
# map the centers back to original units by reversing the standardization (each variable's mean and sd)
centroids_orig <- as.data.frame(t(apply(km$centers, 1, function(row){
vars <- names(clust_df %>% select(Age, Avg_BPM, Session_Duration, Calories_Burned, Workout_Frequency))
sapply(seq_along(row), function(i){
row[i] * sd(clust_df[[vars[i]]], na.rm=TRUE) + mean(clust_df[[vars[i]]], na.rm=TRUE)
})
})))
names(centroids_orig) <- names(clust_df %>% select(Age, Avg_BPM, Session_Duration, Calories_Burned, Workout_Frequency))
centroids_orig$cluster <- 1:nrow(centroids_orig)
write_csv(centroids_orig, "cluster_centroids.csv")
# add clusters back to the original df by row number
# (clust_df was built with drop_na(); the check below guards the assumption that no rows were dropped)
stopifnot(nrow(clust_df) == nrow(df))
df_with_clusters <- df %>%
  mutate(.row = row_number()) %>%
  left_join(clust_df %>% mutate(.row = row_number()) %>% select(.row, cluster), by = ".row") %>%
  select(-.row)
# save cluster sizes
table(df_with_clusters$cluster) %>% as.data.frame() %>% write_csv("cluster_sizes.csv")
# Visualize clusters: project the scaled features onto the first two principal components
res_pca <- prcomp(clust_scaled, scale. = TRUE)  # features are already standardized, so this is a plain PCA
# Build a plotting data frame from the first two principal components and the cluster assignments
plot_data <- data.frame(
  PC1 = res_pca$x[, 1],
  PC2 = res_pca$x[, 2],
  Cluster = factor(km$cluster)
)
p_clusters <- ggplot(plot_data, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point() +
  stat_ellipse(aes(group = Cluster), type = "norm") +  # normal-theory ellipse around each cluster
  labs(title = "K-means clusters (scaled features)", x = "PC1", y = "PC2") +
  theme_minimal()
print(p_clusters)
ggsave("kmeans_clusters.png", plot = p_clusters, width = 7, height = 5)
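# factoextra (listed in the requirements) offers a one-line alternative, if preferred:
# factoextra::fviz_cluster(km, data = clust_scaled, ellipse.type = "norm", ggtheme = theme_minimal())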
# ---- 7. Percentiles needed to move between clusters ----
# For each cluster, compute median and percentiles for the key metrics
cluster_summary <- clust_df %>%
mutate(cluster = as.factor(km$cluster)) %>%
group_by(cluster) %>%
summarise(
n = n(),
med_Cal = median(Calories_Burned, na.rm=TRUE),
p25_Cal = quantile(Calories_Burned, probs=0.25, na.rm=TRUE),
p75_Cal = quantile(Calories_Burned, probs=0.75, na.rm=TRUE),
med_Sess = median(Session_Duration, na.rm=TRUE),
p25_Sess = quantile(Session_Duration, probs=0.25, na.rm=TRUE),
p75_Sess = quantile(Session_Duration, probs=0.75, na.rm=TRUE)
)
write_csv(cluster_summary, "cluster_summary.csv")
print(cluster_summary)
## # A tibble: 3 × 8
## cluster n med_Cal p25_Cal p75_Cal med_Sess p25_Sess p75_Sess
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 356 947 848 1050. 1.26 1.1 1.36
## 2 2 235 1198 1070. 1356. 1.71 1.54 1.85
## 3 3 382 692 553. 800. 1.04 0.81 1.21
# Determine thresholds: for a member in a lower-activity cluster, what Calories_Burned (or Session_Duration) value would move them into a higher-activity cluster?
# Simple approach: compare the 90th percentile of the lower cluster with the 10th percentile of the upper cluster to locate the overlap zone.
cluster_bounds <- clust_df %>% mutate(cluster = as.factor(km$cluster)) %>%
group_by(cluster) %>%
summarise(p10_Cal = quantile(Calories_Burned, probs=0.10, na.rm=TRUE),
p90_Cal = quantile(Calories_Burned, probs=0.90, na.rm=TRUE),
p10_Sess = quantile(Session_Duration, probs=0.10, na.rm=TRUE),
p90_Sess = quantile(Session_Duration, probs=0.90, na.rm=TRUE))
write_csv(cluster_bounds, "cluster_bounds.csv")
print(cluster_bounds)
## # A tibble: 3 × 5
## cluster p10_Cal p90_Cal p10_Sess p90_Sess
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 753 1148 1.01 1.43
## 2 2 957. 1487 1.42 1.94
## 3 3 440 876 0.63 1.37
# Example: what percentile of Calories_Burned would a member need to reach to exceed the median of a higher-activity cluster?
# From the summary above, cluster 2 has the highest median calories and session duration (cluster labels are arbitrary; inspect the centroids to confirm)
# Compute percentiles over whole dataset
all_cal_percentiles <- ecdf(clust_df$Calories_Burned)
all_sess_percentiles <- ecdf(clust_df$Session_Duration)
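# The ecdf objects return the sample percentile of any value, e.g. for a hypothetical member burning 1000 kcal:
# all_cal_percentiles(1000)   # proportion of members at or below 1000 kcal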
# Example: value needed to reach 75th percentile:
cal_75_value <- quantile(clust_df$Calories_Burned, probs=0.75, na.rm=TRUE)
sess_75_value <- quantile(clust_df$Session_Duration, probs=0.75, na.rm=TRUE)
cat("Example thresholds (75th percentiles): Calories_Burned:", cal_75_value, " Session_Duration:", sess_75_value, "\n")
## Example thresholds (75th percentiles): Calories_Burned: 1076 Session_Duration: 1.46
# ---- 8. Additional regression: alternative predictors of Calories_Burned ----
# A second multiple linear regression on the real data, using session length, heart-rate metrics,
# body composition, and age as predictors (the tilde separates the response from the predictors)
model <- lm(Calories_Burned ~ Session_Duration + Resting_BPM + Max_BPM + Fat_Percentage + Age, data = df)
# Print a summary of the model results
summary(model)
##
## Call:
## lm(formula = Calories_Burned ~ Session_Duration + Resting_BPM +
## Max_BPM + Fat_Percentage + Age, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -300.65 -71.30 -5.32 73.25 338.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 273.6581 67.0472 4.082 4.84e-05 ***
## Session_Duration 671.1796 12.0989 55.474 < 2e-16 ***
## Resting_BPM 1.1086 0.4611 2.404 0.0164 *
## Max_BPM -0.2555 0.2930 -0.872 0.3833
## Fat_Percentage -4.6088 0.6629 -6.952 6.61e-12 ***
## Age -3.0872 0.2771 -11.142 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 105.2 on 967 degrees of freedom
## Multiple R-squared: 0.8519, Adjusted R-squared: 0.8512
## F-statistic: 1113 on 5 and 967 DF, p-value: < 2.2e-16
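# (optional) base-R diagnostic plots for this model; best run interactively rather than in the rendered report:
# par(mfrow = c(2, 2)); plot(model); par(mfrow = c(1, 1))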
library(ggplot2)
library(patchwork) # For combining plots
# Scatter plot for Session Duration
p_duration <- ggplot(df, aes(x = Session_Duration, y = Calories_Burned)) +
geom_point(alpha = 0.5, color = "dodgerblue") +
geom_smooth(method = "lm", se = FALSE, color = "firebrick", size = 1.2) +
labs(title = "Calories vs. Session Duration",
x = "Session Duration", y = "Calories Burned") +
theme_minimal()
# Scatter plot for Max BPM
p_maxbpm <- ggplot(df, aes(x = Max_BPM, y = Calories_Burned)) +
geom_point(alpha = 0.5, color = "forestgreen") +
geom_smooth(method = "lm", se = FALSE, color = "orange", size = 1.2) +
labs(title = "Calories vs. Max BPM",
x = "Max Beats Per Minute", y = "Calories Burned") +
theme_minimal()
# Scatter plot for Fat Percentage
p_fat <- ggplot(df, aes(x = Fat_Percentage, y = Calories_Burned)) +
geom_point(alpha = 0.5, color = "purple") +
geom_smooth(method = "lm", se = FALSE, color = "black", size = 1.2) +
labs(title = "Calories vs. Fat Percentage",
x = "Fat Percentage", y = "Calories Burned") +
theme_minimal()
# Combine the plots into a single view
p_duration + p_maxbpm + p_fat + plot_layout(ncol = 3)
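# (optional) the hydration/heart-rate relationship noted in the introduction can be checked directly
# on the real data; a simple Pearson correlation is one minimal sketch (output not shown here):
# cor(df$Water_Intake, df$Avg_BPM, use = "complete.obs")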
# ---- 9. Supplementary Information ----
# Load necessary libraries for data manipulation and visualization
# If you don't have these installed, run install.packages("ggplot2"), install.packages("patchwork"), and install.packages("dplyr")
library(ggplot2)
library(patchwork)
library(dplyr)
# --- SECTION 1: Creating a Sample Dataset for Demonstration ---
# This block builds a small simulated dataset (sim_df) so the supplementary visualizations can run
# on their own; it deliberately does not overwrite the real data frame `df` loaded above.
set.seed(42) # For reproducibility
num_records <- 200
sim_df <- data.frame(
  Workout_Type = as.factor(sample(c("Cardio", "Strength", "HIIT", "Yoga"), num_records, replace = TRUE)),
  Calories_Burned = rnorm(num_records, mean = 250, sd = 70),
  Water_Intake = rnorm(num_records, mean = 2.5, sd = 0.8),
  Resting_BPM = rnorm(num_records, mean = 65, sd = 8),
  BMI = rnorm(num_records, mean = 24, sd = 3),
  Avg_BPM = rnorm(num_records, mean = 110, sd = 15)  # simulated average BPM (used in place of Fat_Percentage)
)
# Adjust values to be more realistic for each workout type and other factors
sim_df <- sim_df %>%
mutate(
Calories_Burned = case_when(
Workout_Type == "HIIT" ~ Calories_Burned + 100,
Workout_Type == "Strength" ~ Calories_Burned + 50,
TRUE ~ Calories_Burned
),
# Adjust Avg_BPM based on Workout_Type
Avg_BPM = case_when(
Workout_Type == "HIIT" ~ Avg_BPM + 20,
Workout_Type == "Cardio" ~ Avg_BPM + 10,
Workout_Type == "Yoga" ~ Avg_BPM - 30,
TRUE ~ Avg_BPM
),
Resting_BPM = case_when(
Water_Intake > 3 ~ Resting_BPM - 5,
Water_Intake < 2 ~ Resting_BPM + 5,
TRUE ~ Resting_BPM
)
)
# --- SECTION 2: Analysis of Workout_Type on Calories_Burned and Avg_BPM ---
# Description:
# Since Workout_Type is a categorical variable, a great way to visualize its
# effect on numerical variables is with box plots or violin plots. These plots
# show the distribution of data for each category, allowing you to easily
# compare the median, spread, and outliers.
# Visualization 1: Calories Burned by Workout Type
plot_calories <- ggplot(sim_df, aes(x = Workout_Type, y = Calories_Burned, fill = Workout_Type)) +
geom_violin(trim = FALSE) +
geom_boxplot(width = 0.2, alpha = 0.7, fill = "white") +
labs(
title = "Calories Burned by Workout Type",
subtitle = "Higher median calories burned for HIIT and Strength workouts",
x = "Workout Type",
y = "Calories Burned (kcal)"
) +
theme_minimal() +
scale_fill_viridis_d() +
theme(legend.position = "none")
# Visualization 2: Average BPM by Workout Type
plot_avg_bpm <- ggplot(sim_df, aes(x = Workout_Type, y = Avg_BPM, fill = Workout_Type)) +
geom_violin(trim = FALSE) +
geom_boxplot(width = 0.2, alpha = 0.7, fill = "white") +
labs(
title = "Average BPM by Workout Type",
subtitle = "HIIT and Cardio workouts show a higher average BPM",
x = "Workout Type",
y = "Average BPM"
) +
theme_minimal() +
scale_fill_viridis_d() +
theme(legend.position = "none")
# Combine the two plots for a side-by-side comparison
combined_workout_plots <- plot_calories + plot_avg_bpm
# Print the combined plot
combined_workout_plots
# Statistical Analysis (ANOVA):
# To formally test if the average Calories_Burned or Avg_BPM is
# statistically different across workout types, we can use a one-way ANOVA.
cat("\n--- ANOVA Results for Calories Burned by Workout Type ---\n")
##
## --- ANOVA Results for Calories Burned by Workout Type ---
anova_calories <- aov(Calories_Burned ~ Workout_Type, data = sim_df)
print(summary(anova_calories))
## Df Sum Sq Mean Sq F value Pr(>F)
## Workout_Type 3 171769 57256 12.7 1.29e-07 ***
## Residuals 196 884019 4510
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- ANOVA Results for Average BPM by Workout Type ---\n")
##
## --- ANOVA Results for Average BPM by Workout Type ---
anova_avg_bpm <- aov(Avg_BPM ~ Workout_Type, data = sim_df)
print(summary(anova_avg_bpm))
## Df Sum Sq Mean Sq F value Pr(>F)
## Workout_Type 3 51491 17164 63.3 <2e-16 ***
## Residuals 196 53148 271
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# --- SECTION 3: Analysis of Water_Intake on Resting_BPM, BMI, and Avg_BPM ---
# Description:
# For visualizing the relationship between two continuous variables, a scatter plot
# with a linear regression line is a perfect tool. The line shows the overall trend,
# and the shaded area is the confidence interval for that trend.
# Visualization 1: Water Intake vs. Resting BPM
plot_bpm <- ggplot(sim_df, aes(x = Water_Intake, y = Resting_BPM)) +
geom_point(alpha = 0.6, color = "darkblue") +
geom_smooth(method = "lm", se = TRUE, color = "skyblue") +
labs(
title = "Water Intake vs. Resting BPM",
x = "Daily Water Intake (L)",
y = "Resting BPM"
) +
theme_minimal()
# Visualization 2: Water Intake vs. BMI
plot_bmi <- ggplot(sim_df, aes(x = Water_Intake, y = BMI)) +
geom_point(alpha = 0.6, color = "darkgreen") +
geom_smooth(method = "lm", se = TRUE, color = "lightgreen") +
labs(
title = "Water Intake vs. BMI",
x = "Daily Water Intake (L)",
y = "BMI"
) +
theme_minimal()
# Visualization 3: Water Intake vs. Average BPM
plot_avg_bpm_water <- ggplot(sim_df, aes(x = Water_Intake, y = Avg_BPM)) +
geom_point(alpha = 0.6, color = "darkred") +
geom_smooth(method = "lm", se = TRUE, color = "orange") +
labs(
title = "Water Intake vs. Average BPM",
x = "Daily Water Intake (L)",
y = "Average BPM"
) +
theme_minimal()
# Combine the three plots into a single layout
combined_water_plots <- plot_bpm + plot_bmi + plot_avg_bpm_water
# Print the combined plot
combined_water_plots
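# (optional) to quantify the trends plotted above on the simulated data, e.g. water intake vs resting BPM:
# cor.test(sim_df$Water_Intake, sim_df$Resting_BPM)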
# ---- 10. End message ----
cat("Analysis complete. Plots saved: session_duration_density.png, calories_burned_density.png, kmeans_clusters.png. \n")
## Analysis complete. Plots saved: session_duration_density.png, calories_burned_density.png, kmeans_clusters.png.
cat("CSV outputs saved: cluster_centroids.csv, cluster_summary.csv, cluster_bounds.csv, lm_calories_summary.txt, lm_log_calories_summary.txt\n")
## CSV outputs saved: cluster_centroids.csv, cluster_summary.csv, cluster_bounds.csv, lm_calories_summary.txt, lm_log_calories_summary.txt