Code
library(tidyverse)
library(ggplot2)
# --- 1. FAO Healthy Diet Dataset ---
# Data Source- This dataset is provided by the Food and Agriculture Organization (FAO). It tracks the cost of a healthy diet across different countries and regions from 2017 to 2024. Structure Before Tidying. The raw data is in a "wide" format where each year (Y2017, Y2018, etc.) is a separate column.
fao_raw <- read.csv("C:/Users/radzh/OneDrive/Desktop/data 607/project2/csv/fao_diet_raw.csv", check.names = FALSE)
# Displaying initial structure
glimpse(fao_raw)Rows: 3,606
Columns: 18
$ `Area Code` <chr> "2,\"'004\",\"Afghanistan\",\"7005\",\"Prevalence of…
$ `Area Code (M49)` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Area <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Item Code` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Item <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Element Code` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Element <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ `Release Code` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Release <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Unit <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2017 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2019 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2020 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2021 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2022 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2023 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ Y2024 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Code
#Transformation Steps. We use pivot_longer to move the year columns into a single "Year" variable, which is essential for time-series analysis.
fao_tidy <- fao_raw %>%
select(Area, Item, starts_with("Y20")) %>%
pivot_longer(cols = starts_with("Y20"),
names_to = "Year",
values_to = "Cost") %>%
mutate(
Year = as.numeric(gsub("Y", "", Year)),
Cost = as.numeric(as.character(Cost))
) %>%
drop_na(Cost)
#Analysis & Results. We analyze the average cost trend across all reporting areas.
fao_summary <- fao_tidy %>%
group_by(Year) %>%
summarize(Avg_Cost = mean(Cost, na.rm = TRUE))
knitr::kable(fao_summary, col.names = c("Year", "Average Cost (PPP dollar)"))| Year | Average Cost (PPP dollar) |
|---|
Code
ggplot(fao_summary, aes(x = Year, y = Avg_Cost)) +
geom_line(color = "blue", size = 1) +
geom_point() +
labs(title = "Average Cost of Healthy Diet (2017-2024)",
y = "Mean Cost (PPP$)", x = "Year") +
theme_minimal()Code
#Interpretation: The data shows a steady increase in the cost of a healthy diet globally, particularly accelerating after 2020.
#2. Dataset: Gym Members Exercise Tracking.Data Source- Sourced from Kaggle's Gym Members dataset, tracking physical metrics and workout types. Structure Before Tidying.The data contains separate columns for different heart rate metrics (Max, Avg, Resting).
gym_raw <- read.csv("C:/Users/radzh/OneDrive/Desktop/data 607/project2/csv/gym_members_exercise_tracking.csv")
head(gym_raw) Age Gender Weight..kg. Height..m. Max_BPM Avg_BPM Resting_BPM
1 56 Male 88.3 1.71 180 157 60
2 46 Female 74.9 1.53 179 151 66
3 32 Female 68.1 1.66 167 122 54
4 25 Male 53.2 1.70 190 164 56
5 38 Male 46.1 1.79 188 158 68
6 56 Female 58.0 1.68 168 156 74
Session_Duration..hours. Calories_Burned Workout_Type Fat_Percentage
1 1.69 1313 Yoga 12.6
2 1.30 883 HIIT 33.9
3 1.11 677 Cardio 33.4
4 0.59 532 Strength 28.8
5 0.64 556 Strength 29.2
6 1.59 1116 HIIT 15.5
Water_Intake..liters. Workout_Frequency..days.week. Experience_Level BMI
1 3.5 4 3 30.20
2 2.1 4 2 32.00
3 2.3 4 2 24.71
4 2.1 3 1 18.41
5 2.8 3 1 14.39
6 2.7 5 3 20.55
Code
#Transformation Steps. To compare BPM types across workout categories, we pivot the three BPM columns into a long format.
gym_tidy <- gym_raw %>%
select(Workout_Type, Max_BPM, Avg_BPM, Resting_BPM) %>%
pivot_longer(cols = ends_with("BPM"),
names_to = "BPM_Type",
values_to = "BPM_Value")
#Analysis & Results
ggplot(gym_tidy, aes(x = Workout_Type, y = BPM_Value, fill = BPM_Type)) +
geom_boxplot() +
labs(title = "Heart Rate Distribution by Workout Type",
y = "Beats Per Minute", x = "Workout Category") +
theme_minimal()Code
#Interpretation: HIIT and Cardio workouts consistently show higher Average and Max BPM values compared to Yoga, which aligns with the intensity levels of these activities.
#3. Dataset: Global CO2 Emissions. Data Source- This dataset tracks CO2 emission rates (metric tons) per country and year. Structure Before Tidying. Data was initially provided in a format where emission rates were stored as characters, which prevents mathematical operations.
co2_raw <- read.csv("C:/Users/radzh/OneDrive/Desktop/data 607/project2/csv/tidy_format_co2_emission_dataset.csv")
str(co2_raw)'data.frame': 5600 obs. of 3 variables:
$ Country : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
$ Year : chr "2021" "2021" "2021" "2021" ...
$ CO2EmissionRate..mt.: chr "8.35" "4.59" "173" "24.45" ...
Code
#Transformation Steps. We clean the data by ensuring numeric types and filtering out incomplete records.
co2_clean <- co2_raw %>%
mutate(
Year = as.numeric(as.character(Year)),
Emissions = as.numeric(as.character(CO2EmissionRate..mt.))
) %>%
filter(!is.na(Emissions))
#Analysis & Results. Summary of total global emissions per year.
co2_summary <- co2_clean %>%
group_by(Year) %>%
summarize(Total_Emissions = sum(Emissions, na.rm = TRUE))
knitr::kable(co2_summary)| Year | Total_Emissions |
|---|---|
| 1990 | 803.80 |
| 1991 | 819.30 |
| 1992 | 963.00 |
| 1993 | 974.20 |
| 1994 | 969.80 |
| 1995 | 967.00 |
| 1996 | 961.30 |
| 1997 | 967.40 |
| 1998 | 957.90 |
| 1999 | 939.50 |
| 2000 | 980.90 |
| 2001 | 969.50 |
| 2002 | 964.80 |
| 2003 | 1000.60 |
| 2004 | 1023.00 |
| 2005 | 1039.80 |
| 2006 | 1039.10 |
| 2007 | 1057.80 |
| 2008 | 1044.20 |
| 2009 | 988.10 |
| 2010 | 962.30 |
| 2011 | 956.40 |
| 2012 | 935.10 |
| 2013 | 919.40 |
| 2014 | 917.00 |
| 2015 | 1023.00 |
| 2018 | 1051.60 |
| 2021 | 13821.23 |
| NA | 56104.00 |
Code
ggplot(co2_summary, aes(x = Year, y = Total_Emissions)) +
geom_col(fill = "darkred") +
labs(title = "Total Global CO2 Emissions by Year",
y = "Total mt CO2", x = "Year") +
theme_minimal()Code
#Interpretation: The bar chart illustrates the total global CO2 output. Note any significant drops or spikes which could correlate with global economic shifts or environmental policies.
#Conclusion. By applying tidying principles—specifically moving from wide to long formats—we were able to create reproducible visualizations and summary tables. This workflow ensures that raw, messy data is transformed into a structured format ready for statistical insight.