# =============================
#
# Final Project Notes & Code
#
# Dataset: obesity_data
#
# =============================

# Load necessary packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Build the synthetic regional dataset used by every plot below.
# The seed is fixed so the random columns are reproducible. The random draws
# happen in column order, so reordering the columns would change the values.
set.seed(42)
obesity_data <- local({
  regions <- c("Norte", "Centro", "Lisboa", "Alentejo", "Algarve")
  n_regions <- length(regions)

  # Draw one integer per region, with replacement, from the given values.
  draw_int <- function(values) sample(values, n_regions, replace = TRUE)

  tibble(
    Region = regions,
    Obesity = draw_int(18:30),
    Diabetes = draw_int(5:12),
    Fruits = draw_int(150:300),
    Vegetables = draw_int(150:250),
    SugarDrinks = draw_int(50:200),
    ExerciseHours = draw_int(0:12),
    SmokingRate = draw_int(5:35),
    Population = c(350000, 250000, 500000, 150000, 200000),
    IncomePerCapita = c(20000, 18000, 25000, 17000, 22000),
    RandomVar1 = runif(n_regions, 0, 1),   # irrelevant
    RandomVar2 = runif(n_regions, 0, 100)  # irrelevant
  )
})

# =============================
# Part 1: Bar chart Obesity by Region
# =============================
# One bar per region, ordered left to right by increasing obesity value,
# with the percentage printed just above each bar.
obesity_data %>%
  ggplot(aes(x = reorder(Region, Obesity), y = Obesity)) +
  geom_col(aes(fill = Region), show.legend = FALSE) +
  geom_text(aes(label = paste0(Obesity, "%")), vjust = -0.5) +
  ggtitle("Obesity percentage by region") +
  xlab("Region") +
  ylab("Obesity (%)") +
  theme_minimal()

# =============================
# Part 2: Scatterplot Fruits vs Obesity (interactive)
# =============================
# Static ggplot first: colour encodes the region, point size encodes income.
# The `text` aesthetic is not drawn by ggplot2; it only feeds the plotly
# tooltip requested below.
p2 <- obesity_data %>%
  ggplot(aes(x = Fruits, y = Obesity, text = paste("Region:", Region))) +
  geom_point(aes(color = Region, size = IncomePerCapita)) +
  theme_minimal() +
  labs(
    title = "Fruit consumption vs Obesity",
    x = "Fruits (g/day)",
    y = "Obesity (%)",
    color = "Region",
    size = "Income per capita (€)"
  )

# Convert to an interactive widget whose hover box shows only the region text.
p2 %>% ggplotly(tooltip = "text")
# =============================
# Part 3: Boxplot for Obesity, Diabetes, ExerciseHours
# =============================
# Reshape to one row per (Region, Indicator) pair so the three indicators can
# share a single plotting call.
obesity_long <- obesity_data %>%
  select(Region, Obesity, Diabetes, ExerciseHours) %>%
  pivot_longer(cols = c(Obesity, Diabetes, ExerciseHours),
               names_to = "Indicator", values_to = "Value")

# Each region has exactly one value per indicator, so a box per region would
# collapse to a flat line. Instead, each box summarises one indicator across
# the regions. The indicators use different units, so every panel gets its own
# y scale.
ggplot(obesity_long, aes(x = Indicator, y = Value, fill = Indicator)) +
  geom_boxplot(show.legend = FALSE) +
  facet_wrap(vars(Indicator), scales = "free") +
  labs(title = "Distribution of health indicators across regions",
       x = NULL, y = "Value") +
  theme_minimal()

# =============================
# Part 4: Dumbbell chart Fruits vs Vegetables
# =============================
# One horizontal dumbbell per region: the two dots mark fruit and vegetable
# consumption, and the grey bar between them shows the gap.
ggplot(obesity_data, aes(y = Region)) +
  # Connector drawn first so the endpoints sit on top of it.
  geom_segment(aes(x = Fruits, xend = Vegetables, yend = Region),
               color = "grey60", linewidth = 1.5) +
  # Constant strings inside aes() create a two-entry colour legend.
  geom_point(aes(x = Fruits, color = "Fruits"), size = 4) +
  geom_point(aes(x = Vegetables, color = "Vegetables"), size = 4) +
  labs(title = "Fruits vs Vegetables by Region",
       x = "Consumption (g/day)", y = "Region", color = "Food group") +
  theme_minimal()

# =============================
# Part 5: Density plot ExerciseHours
# =============================
# Each region contributes a single observation, and geom_density() drops any
# group with fewer than two points. Splitting by Region therefore produces an
# empty plot, so the density is estimated over all regions together.
ggplot(obesity_data, aes(x = ExerciseHours)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Exercise hours distribution across regions",
       x = "Exercise hours per week", y = "Density") +
  theme_minimal()

# =============================
# Part 6: Scatterplot SugarDrinks vs Obesity
# =============================
# Red points for each region, with the region name nudged slightly above
# its point.
ggplot(data = obesity_data, mapping = aes(x = SugarDrinks, y = Obesity)) +
  geom_point(colour = "red", size = 4) +
  geom_text(aes(label = Region), nudge_y = 0.5) +
  ggtitle("Sugar-Sweetened Drinks vs Obesity") +
  xlab("Sugar Drinks (ml/day)") +
  ylab("Obesity (%)") +
  theme_minimal()

# =============================
# Part 7: Grouped bar chart Obesity & Diabetes
# =============================
# Reshape to one row per (Region, Indicator) pair so the two indicators can be
# drawn side by side within each region.
grouped_long <- obesity_data %>%
  select(Region, Obesity, Diabetes) %>%
  pivot_longer(cols = c(Obesity, Diabetes),
               names_to = "Indicator", values_to = "Value")

# The bar width (0.7) is kept below the dodge width (0.8). With the default
# bar width of 0.9 the two bars of a region would overlap slightly.
ggplot(grouped_long, aes(x = Region, y = Value, fill = Indicator)) +
  geom_col(width = 0.7, position = position_dodge(width = 0.8)) +
  labs(title = "Obesity and Diabetes by Region",
       x = "Region", y = "Percentage (%)", fill = "Indicator") +
  theme_minimal()

# =============================
# Part 8: Heatmap Vegetables vs Region
# =============================
# Single-row heatmap: one tile per region, coloured by vegetable consumption.
# y is a constant label rather than the Vegetables value. Mapping the
# continuous value to y would scatter one-unit-high slivers along the axis
# instead of forming a row of tiles.
ggplot(obesity_data, aes(x = Region, y = "Vegetables", fill = Vegetables)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "plasma") +
  labs(title = "Vegetable consumption by region",
       x = "Region", y = NULL, fill = "Vegetables (g/day)") +
  theme_minimal()