Project2

Author

Radzhana Rabdanova

Published

March 8, 2026

Code
library(tidyverse)
library(ggplot2)

# --- 1. FAO Healthy Diet Dataset ---

# Data Source- This dataset is provided by the Food and Agriculture Organization (FAO). It tracks the cost of a healthy diet across different countries and regions from 2017 to 2024. Structure Before Tidying. The raw data is in a "wide" format where each year (Y2017, Y2018, etc.) is a separate column.

# Use relative paths - put files in the same folder as this .qmd
fao_raw <- read.csv("fao_diet_raw.csv", check.names = FALSE)

fao_tidy <- fao_raw %>%
  select(Area, Item, starts_with("Y20")) %>%
  pivot_longer(cols = starts_with("Y20"), names_to = "Year", values_to = "Cost") %>%
  mutate(
    Year = as.numeric(gsub("Y", "", Year)),
    Cost = as.numeric(as.character(Cost))
  ) %>%
  drop_na(Cost)

head(fao_tidy)
# A tibble: 0 × 4
# ℹ 4 variables: Area <lgl>, Item <lgl>, Year <dbl>, Cost <dbl>
Code
#Interpretation: The data shows a steady increase in the cost of a healthy diet globally, particularly accelerating after 2020.

#2. Dataset: Gym Members Exercise Tracking.Data Source- Sourced from Kaggle's Gym Members dataset, tracking physical metrics and workout types. Structure Before Tidying.The data contains separate columns for different heart rate metrics (Max, Avg, Resting).

gym_raw <- read.csv("gym_members_exercise_tracking.csv")
head(gym_raw)
  Age Gender Weight..kg. Height..m. Max_BPM Avg_BPM Resting_BPM
1  56   Male        88.3       1.71     180     157          60
2  46 Female        74.9       1.53     179     151          66
3  32 Female        68.1       1.66     167     122          54
4  25   Male        53.2       1.70     190     164          56
5  38   Male        46.1       1.79     188     158          68
6  56 Female        58.0       1.68     168     156          74
  Session_Duration..hours. Calories_Burned Workout_Type Fat_Percentage
1                     1.69            1313         Yoga           12.6
2                     1.30             883         HIIT           33.9
3                     1.11             677       Cardio           33.4
4                     0.59             532     Strength           28.8
5                     0.64             556     Strength           29.2
6                     1.59            1116         HIIT           15.5
  Water_Intake..liters. Workout_Frequency..days.week. Experience_Level   BMI
1                   3.5                             4                3 30.20
2                   2.1                             4                2 32.00
3                   2.3                             4                2 24.71
4                   2.1                             3                1 18.41
5                   2.8                             3                1 14.39
6                   2.7                             5                3 20.55
Code
#Transformation Steps. To compare BPM types across workout categories, we pivot the three BPM columns into a long format.

gym_tidy <- gym_raw %>%
  select(Workout_Type, Max_BPM, Avg_BPM, Resting_BPM) %>%
  pivot_longer(cols = ends_with("BPM"), 
               names_to = "BPM_Type", 
               values_to = "BPM_Value")

#Analysis & Results

ggplot(gym_tidy, aes(x = Workout_Type, y = BPM_Value, fill = BPM_Type)) +
  geom_boxplot() +
  labs(title = "Heart Rate Distribution by Workout Type", 
       y = "Beats Per Minute", x = "Workout Category") +
  theme_minimal()

Code
#Interpretation: HIIT and Cardio workouts consistently show higher Average and Max BPM values compared to Yoga, which aligns with the intensity levels of these activities.

#3. Dataset: Global CO2 Emissions. Data Source- This dataset tracks CO2 emission rates (metric tons) per country and year. Structure Before Tidying. Data was initially provided in a format where emission rates were stored as characters, which prevents mathematical operations.

co2_raw <- read.csv("tidy_format_co2_emission_dataset.csv")
str(co2_raw)
'data.frame':   5600 obs. of  3 variables:
 $ Country             : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
 $ Year                : chr  "2021" "2021" "2021" "2021" ...
 $ CO2EmissionRate..mt.: chr  "8.35" "4.59" "173" "24.45" ...
Code
#Transformation Steps. We clean the data by ensuring numeric types and filtering out incomplete records.

co2_clean <- co2_raw %>%
  mutate(
    Year = as.numeric(as.character(Year)),
    Emissions = as.numeric(as.character(CO2EmissionRate..mt.))
  ) %>%
  filter(!is.na(Emissions))

#Analysis & Results. Summary of total global emissions per year.

co2_summary <- co2_clean %>%
  group_by(Year) %>%
  summarize(Total_Emissions = sum(Emissions, na.rm = TRUE))

knitr::kable(co2_summary)
Year Total_Emissions
1990 803.80
1991 819.30
1992 963.00
1993 974.20
1994 969.80
1995 967.00
1996 961.30
1997 967.40
1998 957.90
1999 939.50
2000 980.90
2001 969.50
2002 964.80
2003 1000.60
2004 1023.00
2005 1039.80
2006 1039.10
2007 1057.80
2008 1044.20
2009 988.10
2010 962.30
2011 956.40
2012 935.10
2013 919.40
2014 917.00
2015 1023.00
2018 1051.60
2021 13821.23
NA 56104.00
Code
ggplot(co2_summary, aes(x = Year, y = Total_Emissions)) +
  geom_col(fill = "darkred") +
  labs(title = "Total Global CO2 Emissions by Year", 
       y = "Total mt CO2", x = "Year") +
  theme_minimal()

Code
#Interpretation: The bar chart illustrates the total global CO2 output. Note any significant drops or spikes which could correlate with global economic shifts or environmental policies.

#Conclusion. By applying tidying principles—specifically moving from wide to long formats—we were able to create reproducible visualizations and summary tables. This workflow ensures that raw, messy data is transformed into a structured format ready for statistical insight.