Project2

Author

Radzhana Rabdanova

Published

March 8, 2026

Code

library(tidyverse)
library(ggplot2)

# --- 1. FAO Healthy Diet Dataset ---

# Data Source- This dataset is provided by the Food and Agriculture Organization (FAO). It tracks the cost of a healthy diet across different countries and regions from 2017 to 2024. Structure Before Tidying. The raw data is in a "wide" format where each year (Y2017, Y2018, etc.) is a separate column.

fao_raw <- read.csv("C:/Users/radzh/OneDrive/Desktop/data 607/project2/csv/fao_diet_raw.csv", check.names = FALSE)
# Displaying initial structure
glimpse(fao_raw)

Rows: 3,606
Columns: 18
$ `Area Code`       <chr> "2,\"'004\",\"Afghanistan\",\"7005\",\"Prevalence of…
$ `Area Code (M49)` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Area              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Item Code`       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Item              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Element Code`    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Element           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Release Code`    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Release           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Unit              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2017             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2018             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2019             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2020             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2021             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2022             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2023             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2024             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

Code

#Transformation Steps. We use pivot_longer to move the year columns into a single "Year" variable, which is essential for time-series analysis.

fao_tidy <- fao_raw %>%
  select(Area, Item, starts_with("Y20")) %>%
  pivot_longer(cols = starts_with("Y20"), 
               names_to = "Year", 
               values_to = "Cost") %>%
  mutate(
    Year = as.numeric(gsub("Y", "", Year)),
    Cost = as.numeric(as.character(Cost))
  ) %>%
  drop_na(Cost)

#Analysis & Results. We analyze the average cost trend across all reporting areas.
fao_summary <- fao_tidy %>%
  group_by(Year) %>%
  summarize(Avg_Cost = mean(Cost, na.rm = TRUE))

knitr::kable(fao_summary, col.names = c("Year", "Average Cost (PPP dollar)"))

Year	Average Cost (PPP dollar)

Code

ggplot(fao_summary, aes(x = Year, y = Avg_Cost)) +
  geom_line(color = "blue", size = 1) +
  geom_point() +
  labs(title = "Average Cost of Healthy Diet (2017-2024)", 
       y = "Mean Cost (PPP$)", x = "Year") +
  theme_minimal()

Code

#Interpretation: The data shows a steady increase in the cost of a healthy diet globally, particularly accelerating after 2020.

#2. Dataset: Gym Members Exercise Tracking.Data Source- Sourced from Kaggle's Gym Members dataset, tracking physical metrics and workout types. Structure Before Tidying.The data contains separate columns for different heart rate metrics (Max, Avg, Resting).

gym_raw <- read.csv("C:/Users/radzh/OneDrive/Desktop/data 607/project2/csv/gym_members_exercise_tracking.csv")
head(gym_raw)

  Age Gender Weight..kg. Height..m. Max_BPM Avg_BPM Resting_BPM
1  56   Male        88.3       1.71     180     157          60
2  46 Female        74.9       1.53     179     151          66
3  32 Female        68.1       1.66     167     122          54
4  25   Male        53.2       1.70     190     164          56
5  38   Male        46.1       1.79     188     158          68
6  56 Female        58.0       1.68     168     156          74
  Session_Duration..hours. Calories_Burned Workout_Type Fat_Percentage
1                     1.69            1313         Yoga           12.6
2                     1.30             883         HIIT           33.9
3                     1.11             677       Cardio           33.4
4                     0.59             532     Strength           28.8
5                     0.64             556     Strength           29.2
6                     1.59            1116         HIIT           15.5
  Water_Intake..liters. Workout_Frequency..days.week. Experience_Level   BMI
1                   3.5                             4                3 30.20
2                   2.1                             4                2 32.00
3                   2.3                             4                2 24.71
4                   2.1                             3                1 18.41
5                   2.8                             3                1 14.39
6                   2.7                             5                3 20.55

Code

#Transformation Steps. To compare BPM types across workout categories, we pivot the three BPM columns into a long format.

gym_tidy <- gym_raw %>%
  select(Workout_Type, Max_BPM, Avg_BPM, Resting_BPM) %>%
  pivot_longer(cols = ends_with("BPM"), 
               names_to = "BPM_Type", 
               values_to = "BPM_Value")

#Analysis & Results

ggplot(gym_tidy, aes(x = Workout_Type, y = BPM_Value, fill = BPM_Type)) +
  geom_boxplot() +
  labs(title = "Heart Rate Distribution by Workout Type", 
       y = "Beats Per Minute", x = "Workout Category") +
  theme_minimal()

Code

#Interpretation: HIIT and Cardio workouts consistently show higher Average and Max BPM values compared to Yoga, which aligns with the intensity levels of these activities.

#3. Dataset: Global CO2 Emissions. Data Source- This dataset tracks CO2 emission rates (metric tons) per country and year. Structure Before Tidying. Data was initially provided in a format where emission rates were stored as characters, which prevents mathematical operations.

co2_raw <- read.csv("C:/Users/radzh/OneDrive/Desktop/data 607/project2/csv/tidy_format_co2_emission_dataset.csv")
str(co2_raw)

'data.frame':   5600 obs. of  3 variables:
 $ Country             : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
 $ Year                : chr  "2021" "2021" "2021" "2021" ...
 $ CO2EmissionRate..mt.: chr  "8.35" "4.59" "173" "24.45" ...

Code

#Transformation Steps. We clean the data by ensuring numeric types and filtering out incomplete records.

co2_clean <- co2_raw %>%
  mutate(
    Year = as.numeric(as.character(Year)),
    Emissions = as.numeric(as.character(CO2EmissionRate..mt.))
  ) %>%
  filter(!is.na(Emissions))

#Analysis & Results. Summary of total global emissions per year.

co2_summary <- co2_clean %>%
  group_by(Year) %>%
  summarize(Total_Emissions = sum(Emissions, na.rm = TRUE))

knitr::kable(co2_summary)

Year	Total_Emissions
1990	803.80
1991	819.30
1992	963.00
1993	974.20
1994	969.80
1995	967.00
1996	961.30
1997	967.40
1998	957.90
1999	939.50
2000	980.90
2001	969.50
2002	964.80
2003	1000.60
2004	1023.00
2005	1039.80
2006	1039.10
2007	1057.80
2008	1044.20
2009	988.10
2010	962.30
2011	956.40
2012	935.10
2013	919.40
2014	917.00
2015	1023.00
2018	1051.60
2021	13821.23
NA	56104.00

Code

ggplot(co2_summary, aes(x = Year, y = Total_Emissions)) +
  geom_col(fill = "darkred") +
  labs(title = "Total Global CO2 Emissions by Year", 
       y = "Total mt CO2", x = "Year") +
  theme_minimal()

Code

#Interpretation: The bar chart illustrates the total global CO2 output. Note any significant drops or spikes which could correlate with global economic shifts or environmental policies.

#Conclusion. By applying tidying principles—specifically moving from wide to long formats—we were able to create reproducible visualizations and summary tables. This workflow ensures that raw, messy data is transformed into a structured format ready for statistical insight.