1) Topic & Project Title

Project Title: Macroeconomic Drivers of Youth Unemployment

Topic: This project explores how macroeconomic factors—GDP per capita, inflation, and government education expenditure—relate to youth unemployment (ages 15–24) across countries over time.

2) Ultimate Goal & Research Questions

Ultimate Goal: Understand patterns and relationships between macro indicators and youth unemployment to inform future analysis and visualization (e.g., country benchmarking, regression, forecasting).

Research Questions:

  1. How has youth unemployment changed over the past decade across countries?
  2. Is youth unemployment correlated with GDP per capita levels and growth?
  3. How does inflation relate to youth unemployment over time?
  4. Is higher education expenditure (as % of GDP) associated with lower youth unemployment?
  5. Which countries show the strongest/weakest improvements and what macro patterns accompany them?

3) Data: Sources & Extraction

We use the World Bank Open Data via the WDI R package (no API key required).

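Indicators used:

  - SL.UEM.1524.ZS: youth unemployment (%)
  - NY.GDP.PCAP.CD: GDP per capita (USD)
  - FP.CPI.TOTL.ZG: inflation (annual %)
  - SE.XPD.TOTL.GD.ZS: education spending (% of GDP)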

# Install if needed:
# install.packages(c("WDI", "dplyr", "tidyr", "janitor", "ggplot2", "zoo"), repos = "https://cloud.r-project.org")

suppressPackageStartupMessages({
  library(WDI)
  library(dplyr)
  library(tidyr)
  library(janitor)
  library(ggplot2)
  library(zoo)
})
# Study sample, as two-letter country codes:
# USA, UK, Spain, Germany, Japan, Brazil.
countries <- c("US", "GB", "ES", "DE", "JP", "BR")

# Rolling window: ends in the current calendar year and starts 11 years
# earlier, i.e. roughly the last 12 years.
end_year   <- as.integer(format(Sys.Date(), "%Y"))
start_year <- end_year - 11

# Display labels keyed by World Bank indicator code.
indicator_names <- c(
  "SL.UEM.1524.ZS"    = "Youth Unemployment (%)",           # youth unemployment (%)
  "NY.GDP.PCAP.CD"    = "GDP per Capita (USD)",             # GDP per capita (USD)
  "FP.CPI.TOTL.ZG"    = "Inflation (CPI, %)",               # inflation (annual %)
  "SE.XPD.TOTL.GD.ZS" = "Education Expenditure (% of GDP)"  # education spend (% of GDP)
)

# Indicator codes to request, taken from the label table so the two
# cannot drift apart.
indicators <- names(indicator_names)
# Download all requested indicators for the chosen countries and year
# window in one call. extra = TRUE also returns country metadata
# columns (the preview below shows region, capital, income, lending, ...).
raw_wdi <- WDI(
  indicator = indicators,
  country   = countries,
  start     = start_year,
  end       = end_year,
  extra     = TRUE,
  cache     = NULL
)

# Quick sanity check on the size and shape of the download.
cat("Raw WDI rows:", nrow(raw_wdi), "\n")
## Raw WDI rows: 66
head(raw_wdi, n = 10)
##    country iso2c iso3c year status lastupdated SL.UEM.1524.ZS NY.GDP.PCAP.CD
## 1   Brazil    BR   BRA 2014         2025-10-07         15.496      12274.994
## 2   Brazil    BR   BRA 2015         2025-10-07         19.493       8936.197
## 3   Brazil    BR   BRA 2016         2025-10-07         26.596       8836.287
## 4   Brazil    BR   BRA 2017         2025-10-07         28.594      10080.509
## 5   Brazil    BR   BRA 2018         2025-10-07         27.958       9300.662
## 6   Brazil    BR   BRA 2019         2025-10-07         27.098       9029.833
## 7   Brazil    BR   BRA 2020         2025-10-07         30.267       7074.194
## 8   Brazil    BR   BRA 2021         2025-10-07         28.311       7972.537
## 9   Brazil    BR   BRA 2022         2025-10-07         20.732       9281.333
## 10  Brazil    BR   BRA 2023         2025-10-07         17.938      10377.589
##    FP.CPI.TOTL.ZG SE.XPD.TOTL.GD.ZS                    region  capital
## 1        6.329040           5.94848 Latin America & Caribbean Brasilia
## 2        9.029901           6.24106 Latin America & Caribbean Brasilia
## 3        8.739144           6.31404 Latin America & Caribbean Brasilia
## 4        3.446373           6.32048 Latin America & Caribbean Brasilia
## 5        3.664850           6.08851 Latin America & Caribbean Brasilia
## 6        3.732976           5.96347 Latin America & Caribbean Brasilia
## 7        3.211768           5.77150 Latin America & Caribbean Brasilia
## 8        8.301660           5.49698 Latin America & Caribbean Brasilia
## 9        9.280106           5.61923 Latin America & Caribbean Brasilia
## 10       4.593563                NA Latin America & Caribbean Brasilia
##    longitude latitude              income lending
## 1   -47.9292 -15.7801 Upper middle income    IBRD
## 2   -47.9292 -15.7801 Upper middle income    IBRD
## 3   -47.9292 -15.7801 Upper middle income    IBRD
## 4   -47.9292 -15.7801 Upper middle income    IBRD
## 5   -47.9292 -15.7801 Upper middle income    IBRD
## 6   -47.9292 -15.7801 Upper middle income    IBRD
## 7   -47.9292 -15.7801 Upper middle income    IBRD
## 8   -47.9292 -15.7801 Upper middle income    IBRD
## 9   -47.9292 -15.7801 Upper middle income    IBRD
## 10  -47.9292 -15.7801 Upper middle income    IBRD
# Normalise the raw column names, then keep only the analysis columns
# under short descriptive names. select() renames and orders the
# columns in a single step; rows end up sorted by country and year.
wdi <- raw_wdi %>%
  clean_names() %>%
  select(
    country_code = iso2c,
    country_name = country,
    region,
    income,
    year,
    youth_unemp  = sl_uem_1524_zs,
    gdp_pc       = ny_gdp_pcap_cd,
    inflation    = fp_cpi_totl_zg,
    edu_spend    = se_xpd_totl_gd_zs
  ) %>%
  arrange(country_code, year)

# Simple time-series imputation helper.
#
# Fills missing values in a time-ordered numeric series:
#   * gaps between two observed values are linearly interpolated;
#   * leading/trailing gaps take the nearest observed value.
#
# Args:
#   x:     Vector of values ordered in time; may contain NA.
#   index: Positions of the observations on the time axis (e.g. the
#          year of each element). Defaults to seq_along(x), which
#          treats the observations as evenly spaced. Must have the same
#          length as x and should not contain duplicates.
#
# Returns:
#   A vector of the same length as x. If x is empty or entirely NA it
#   is returned unchanged, since there is nothing to impute from.
impute_series <- function(x, index = seq_along(x)) {
  stopifnot(length(index) == length(x))

  observed <- !is.na(x)
  if (!any(observed)) {
    return(x)
  }

  # approx() needs at least two points; a single observation can only
  # be carried to every other position.
  if (sum(observed) == 1L) {
    return(rep(x[observed], length(x)))
  }

  # rule = 2 extends the first/last observed value beyond the observed
  # range, covering leading and trailing gaps in the same call.
  stats::approx(
    x      = index[observed],
    y      = x[observed],
    xout   = index,
    method = "linear",
    rule   = 2
  )$y
}

# Impute each indicator within its own country, so values are never
# interpolated or carried across country boundaries.
wdi_imputed <- wdi %>%
  group_by(country_code) %>%
  mutate(
    across(
      c(youth_unemp, gdp_pc, inflation, edu_spend),
      impute_series
    )
  ) %>%
  ungroup()

# Add GDP per capita growth (percent change versus the previous row of
# the same country). Rows are sorted by country and year first, so the
# previous row is the preceding year in the data; the first row of each
# country has no predecessor and gets NA.
wdi_feat <- wdi_imputed %>%
  arrange(country_code, year) %>%
  group_by(country_code) %>%
  mutate(gdp_pc_growth = 100 * (gdp_pc / dplyr::lag(gdp_pc) - 1)) %>%
  ungroup()

head(wdi_feat, n = 10)
## # A tibble: 10 × 10
##    country_code country_name region    income  year youth_unemp gdp_pc inflation
##    <chr>        <chr>        <chr>     <chr>  <int>       <dbl>  <dbl>     <dbl>
##  1 BR           Brazil       Latin Am… Upper…  2014        15.5 12275.      6.33
##  2 BR           Brazil       Latin Am… Upper…  2015        19.5  8936.      9.03
##  3 BR           Brazil       Latin Am… Upper…  2016        26.6  8836.      8.74
##  4 BR           Brazil       Latin Am… Upper…  2017        28.6 10081.      3.45
##  5 BR           Brazil       Latin Am… Upper…  2018        28.0  9301.      3.66
##  6 BR           Brazil       Latin Am… Upper…  2019        27.1  9030.      3.73
##  7 BR           Brazil       Latin Am… Upper…  2020        30.3  7074.      3.21
##  8 BR           Brazil       Latin Am… Upper…  2021        28.3  7973.      8.30
##  9 BR           Brazil       Latin Am… Upper…  2022        20.7  9281.      9.28
## 10 BR           Brazil       Latin Am… Upper…  2023        17.9 10378.      4.59
## # ℹ 2 more variables: edu_spend <dbl>, gdp_pc_growth <dbl>
# Per-country descriptive statistics computed on the imputed,
# feature-augmented panel: covered year range, number of rows, and
# mean/min/max of youth unemployment plus the mean of each macro
# indicator. One row per country, sorted by country code.
desc_by_country <- wdi_feat %>%
  group_by(country_code, country_name) %>%
  summarize(
    years = paste0(min(year, na.rm = TRUE), "-", max(year, na.rm = TRUE)),
    obs   = dplyr::n(),
    youth_unemp_avg = mean(youth_unemp, na.rm = TRUE),
    youth_unemp_min = min(youth_unemp, na.rm = TRUE),
    youth_unemp_max = max(youth_unemp, na.rm = TRUE),
    gdp_pc_avg      = mean(gdp_pc, na.rm = TRUE),
    infl_avg        = mean(inflation, na.rm = TRUE),
    edu_spend_avg   = mean(edu_spend, na.rm = TRUE),
    gdp_pc_g_avg    = mean(gdp_pc_growth, na.rm = TRUE),
    # Return a plain, ungrouped tibble. By default summarize() only
    # peels off the last grouping column, which would leave the result
    # grouped by country_code and affect any later grouped verb.
    .groups = "drop"
  ) %>%
  arrange(country_code)

desc_by_country
## # A tibble: 6 × 11
##   country_code country_name   years       obs youth_unemp_avg youth_unemp_min
##   <chr>        <chr>          <chr>     <int>           <dbl>           <dbl>
## 1 BR           Brazil         2014-2024    11           23.7            15.5 
## 2 DE           Germany        2014-2024    11            6.73            5.86
## 3 ES           Spain          2014-2024    11           37.3            27.0 
## 4 GB           United Kingdom 2014-2024    11           12.8            10.5 
## 5 JP           Japan          2014-2024    11            4.62            3.67
## 6 US           United States  2014-2024    11           10.2             7.95
## # ℹ 5 more variables: youth_unemp_max <dbl>, gdp_pc_avg <dbl>, infl_avg <dbl>,
## #   edu_spend_avg <dbl>, gdp_pc_g_avg <dbl>
# Pooled correlation matrix across all countries and years. Only rows
# with every variable present are used (for example, the first year of
# each country has no growth value and is dropped).
corr_vars <- wdi_feat %>%
  filter(
    !is.na(youth_unemp),
    !is.na(gdp_pc),
    !is.na(gdp_pc_growth),
    !is.na(inflation),
    !is.na(edu_spend)
  ) %>%
  select(youth_unemp, gdp_pc, gdp_pc_growth, inflation, edu_spend)

corr_vars %>%
  cor() %>%
  round(digits = 3)
##               youth_unemp gdp_pc gdp_pc_growth inflation edu_spend
## youth_unemp         1.000 -0.564        -0.054     0.084     0.198
## gdp_pc             -0.564  1.000         0.243    -0.198    -0.056
## gdp_pc_growth      -0.054  0.243         1.000     0.130     0.066
## inflation           0.084 -0.198         0.130     1.000     0.522
## edu_spend           0.198 -0.056         0.066     0.522     1.000
# Line chart of youth unemployment over time, one coloured line (with
# point markers) per country.
ggplot(
  wdi_feat,
  aes(x = year, y = youth_unemp, color = country_code, group = country_code)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  # year is a whole number; default continuous breaks can land on
  # fractional values (e.g. 2017.5), so place a tick every 2 years.
  scale_x_continuous(breaks = scales::breaks_width(2)) +
  labs(
    title = "Youth Unemployment (15–24) Over Time",
    y = indicator_names["SL.UEM.1524.ZS"], x = "Year", color = "Country"
  ) +
  theme_minimal(base_size = 12)

# Scatter of youth unemployment against GDP per capita, coloured by
# country. The x axis is log-scaled with dollar-formatted tick labels.
ggplot(data = wdi_feat, mapping = aes(gdp_pc, youth_unemp, color = country_code)) +
  geom_point(alpha = 0.7) +
  scale_x_log10(labels = scales::dollar) +
  labs(
    x     = indicator_names[["NY.GDP.PCAP.CD"]],
    y     = indicator_names[["SL.UEM.1524.ZS"]],
    color = "Country",
    title = "Youth Unemployment vs GDP per Capita"
  ) +
  theme_minimal(base_size = 12)

# Scatter of youth unemployment against education expenditure
# (% of GDP), coloured by country.
ggplot(data = wdi_feat, mapping = aes(edu_spend, youth_unemp, color = country_code)) +
  geom_point(alpha = 0.7) +
  labs(
    x     = indicator_names[["SE.XPD.TOTL.GD.ZS"]],
    y     = indicator_names[["SL.UEM.1524.ZS"]],
    color = "Country",
    title = "Youth Unemployment vs Education Expenditure (% of GDP)"
  ) +
  theme_minimal(base_size = 12)

# Scatter of youth unemployment against CPI inflation, coloured by
# country.
ggplot(data = wdi_feat, mapping = aes(inflation, youth_unemp, color = country_code)) +
  geom_point(alpha = 0.7) +
  labs(
    x     = indicator_names[["FP.CPI.TOTL.ZG"]],
    y     = indicator_names[["SL.UEM.1524.ZS"]],
    color = "Country",
    title = "Youth Unemployment vs Inflation"
  ) +
  theme_minimal(base_size = 12)

# write.csv(wdi_feat, "youth_unemployment_macro_clean.csv", row.names = FALSE)