library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(trelliscopejs)
## This package is no longer maintained. Please use the 'trelliscope' package instead (see https://github.com/trelliscope/).
# Load the diet and exercise data set from the CSV file (path is relative to
# the working directory) and print a compact overview of its columns.
LifeData <- read.csv(file = "Final_data.csv")
str(object = LifeData)
## 'data.frame': 20000 obs. of 54 variables:
## $ Age : num 34.9 23.4 33.2 38.7 45.1 ...
## $ Gender : chr "Male" "Female" "Female" "Female" ...
## $ Weight..kg. : num 65.3 56.4 59 93.8 52.4 ...
## $ Height..m. : num 1.62 1.55 1.67 1.7 1.88 1.84 1.78 1.63 1.79 1.6 ...
## $ Max_BPM : num 189 179 175 191 194 ...
## $ Avg_BPM : num 158 132 124 155 153 ...
## $ Resting_BPM : num 69 73.2 55 50.1 70.8 ...
## $ Session_Duration..hours. : num 1 1.37 0.91 1.1 1.08 0.69 1.67 1.01 1.76 1.17 ...
## $ Calories_Burned : num 1081 1810 802 1451 1166 ...
## $ Workout_Type : chr "Strength" "HIIT" "Cardio" "HIIT" ...
## $ Fat_Percentage : num 26.8 27.7 24.3 32.8 17.3 ...
## $ Water_Intake..liters. : num 1.5 1.9 1.88 2.5 2.91 2.91 2.71 2.88 3.49 2.49 ...
## $ Workout_Frequency..days.week. : num 3.99 4 2.99 3.99 4 3.02 4.96 3.97 4.01 2 ...
## $ Experience_Level : num 2.01 2.01 1.02 1.99 2 1 3 2.01 3.02 1 ...
## $ BMI : num 24.9 23.5 21.1 32.5 14.8 ...
## $ Daily.meals.frequency : num 2.99 3.01 1.99 3 3 2.99 2.02 2.99 3 3.01 ...
## $ Physical.exercise : num 0.01 0.97 -0.02 0.04 3 -0.04 -0.03 0 0.02 0.02 ...
## $ Carbs : num 268 214 246 203 333 ...
## $ Proteins : num 106 85.4 98.1 80.8 133.1 ...
## $ Fats : num 71.6 57 65.5 54.6 88.4 ...
## $ Calories : num 1806 1577 1608 2657 1470 ...
## $ meal_name : chr "Other" "Other" "Other" "Other" ...
## $ meal_type : chr "Lunch" "Lunch" "Breakfast" "Lunch" ...
## $ diet_type : chr "Vegan" "Vegetarian" "Paleo" "Paleo" ...
## $ sugar_g : num 31.77 12.34 42.81 9.34 23.78 ...
## $ sodium_mg : num 1730 693 2142 123 1935 ...
## $ cholesterol_mg : num 285.1 300.6 215.4 9.7 116.9 ...
## $ serving_size_g : num 120.5 109.2 399.4 314.3 99.2 ...
## $ cooking_method : chr "Grilled" "Fried" "Boiled" "Fried" ...
## $ prep_time_min : num 16.2 16.5 54.4 27.7 34.2 ...
## $ cook_time_min : num 110.79 12.01 6.09 103.72 46.55 ...
## $ rating : num 1.31 1.92 4.7 4.85 3.07 3.38 3.81 3.16 2.81 1.6 ...
## $ Name.of.Exercise : chr "Decline Push-ups" "Bear Crawls" "Dips" "Mountain Climbers" ...
## $ Sets : num 4.99 4.01 5 4.01 4.99 4 5.01 4.97 3.99 4.02 ...
## $ Reps : num 20.9 16.1 21.9 16.9 15 ...
## $ Benefit : chr "Improves shoulder health and posture" "Strengthens lower abs" "Builds chest strength" "Improves coordination and cardiovascular health" ...
## $ Burns.Calories..per.30.min. : num 343 357 360 352 329 ...
## $ Target.Muscle.Group : chr "Shoulders, Triceps" "Back, Core, Shoulders" "Quadriceps, Glutes" "Biceps, Forearms" ...
## $ Equipment.Needed : chr "Cable Machine" "Step or Box" "Step or Box" "Parallel Bars or Chair" ...
## $ Difficulty.Level : chr "Advanced" "Intermediate" "Intermediate" "Advanced" ...
## $ Body.Part : chr "Legs" "Chest" "Arms" "Shoulders" ...
## $ Type.of.Muscle : chr "Lats" "Lats" "Grip Strength" "Upper" ...
## $ Workout : chr "Dumbbell flyes" "Lateral raises" "Standing calf raises" "Incline dumbbell flyes" ...
## $ BMI_calc : num 24.9 23.5 21.1 32.4 14.8 ...
## $ cal_from_macros : num 2140 1712 1966 1627 2659 ...
## $ pct_carbs : num 0.5 0.501 0.501 0.5 0.501 ...
## $ protein_per_kg : num 1.625 1.514 1.663 0.862 2.538 ...
## $ pct_HRR : num 0.741 0.551 0.575 0.744 0.668 ...
## $ pct_maxHR : num 0.836 0.734 0.708 0.811 0.79 ...
## $ cal_balance : num 725 -233 806 1206 304 ...
## $ lean_mass_kg : num 47.8 40.8 44.6 63 43.3 ...
## $ expected_burn : num 685 979 655 774 711 ...
## $ Burns.Calories..per.30.min._bc: num 7.26e+19 1.02e+20 1.08e+20 8.99e+19 5.26e+19 ...
## $ Burns_Calories_Bin : chr "Medium" "High" "High" "High" ...
# Open the spreadsheet-style viewer only in interactive sessions. View() is a
# GUI side effect and can fail when the script is run non-interactively
# (e.g. via Rscript or while knitting a report).
if (interactive()) {
  View(LifeData)
}
# Focus on BMI, diet type, and age: does following a specific diet shift BMI,
# and does age influence that relationship?
sublife <- LifeData[c("BMI", "diet_type", "Age")]

# Age rounded to whole years, stored alongside the raw value.
sublife$rAge <- round(sublife[["Age"]])

# Overall distribution of BMI before splitting by diet type.
summary(sublife[["BMI"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.04 20.10 24.12 24.92 28.56 50.23
# Drop rows with a missing BMI or diet type so that everything computed below
# is based on complete records only.
ld <- sublife %>%
  filter(!is.na(BMI), !is.na(diet_type))

# Five-number summary plus mean and standard deviation of BMI for each diet
# type. This is built from the filtered frame `ld` (previously computed but
# never used), so a missing diet type cannot show up as a group of its own.
# `na.rm` is not needed because BMI has no missing values in `ld`.
sums <- ld %>%
  group_by(diet_type) %>%
  summarise(
    Min = min(BMI),
    # names = FALSE keeps the "25%" / "75%" labels out of the summary columns.
    Q1 = quantile(BMI, 0.25, names = FALSE),
    Median = median(BMI),
    Q3 = quantile(BMI, 0.75, names = FALSE),
    Max = max(BMI),
    Mean = mean(BMI),
    sd = sd(BMI)
  )
# Histogram of BMI with one trelliscope panel per diet type.
# - Uses the NA-filtered frame `ld` so no panel is created for a missing
#   diet type.
# - binwidth = 1 gives bins of one BMI unit each. Stating it explicitly
#   replaces the arbitrary 30-bin default and silences the repeated
#   "`stat_bin()` using `bins = 30`" message.
# - nrow = 1, ncol = 1 shows a single panel per page.
# - path is the base directory handed to facet_trelliscope() for the display.
ld %>%
  ggplot(aes(x = BMI)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  facet_trelliscope(
    ~diet_type,
    nrow = 1, ncol = 1,
    name = "BMI Distribution by Diet Type",
    path = "."
  ) +
  labs(
    title = "BMI Distribution Across Diet Types",
    x = "BMI",
    y = "Counts"
  )
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## using data from the first layer
##
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
The data set used for this analysis was sourced from Kaggle. It is a diet and exercise data set that provides comprehensive information on individuals’ fitness, nutrition, and exercise habits, including physical measures, workout performance metrics, and dietary details. The data set has 20,000 observations and 54 variables of varying data types and structures. Of these variables, body mass index (BMI) and diet type were examined to understand whether an individual’s diet had an effect on the overall distribution of BMI among the observed population. The standard deviation was also used to understand the spread of these distributions and whether diet type had a real effect.
Initial analysis of the five-number summaries showed that there was no extreme skew among the observed variables and that the spread was relatively low as well, indicating that the BMI distribution for each diet type should be roughly symmetric. The primary variable graphed was BMI. This variable was chosen because BMI is a standard measure for assessing body composition and is widely used to categorize population health risks. Examining its distribution gives an initial understanding of the overall physical health profile of the data set’s subjects. The motivation for investigating BMI is to see whether its distribution is sensitive to different lifestyle choices.
The primary variable chosen to facet the plots was diet type. The purpose of this faceting was to directly compare the distribution of BMI across distinct dietary groups. This comparison aims to investigate whether a specific diet might lead to a measurable shift in body weight, potentially identifying which diets correlate with a lower BMI. The graphs show that all of the diets have a very similar distribution, with Keto having the largest number of unusually high BMI measures. However, no one particular diet leads to a lower BMI, as all of the distributions have a very similar spread.
A key challenge was determining the best way to visualize this relationship while still being able to glean meaningful results. Age was initially included as well, to see whether age affected BMI across different diets, but because of the large sample size a scatter plot was not going to work. The approach was then switched to side-by-side boxplots, but interpretability was still an issue, which further suggested that age may not be helpful when examining this relationship.
For further exploration, the reader could investigate a Balance Ratio, calculated as the ratio of (daily calorie intake − calories burned) to an individual’s BMI. This measure directly quantifies the daily energy surplus or deficit relative to an individual’s BMI. It gives insight into which diet naturally results in a sustained, or higher, calorie deficit, creating the most favorable conditions for healthy weight management.