Project Title: Macroeconomic Drivers of Youth Unemployment
Topic: This project explores how macroeconomic factors—GDP per capita, inflation, and government education expenditure—relate to youth unemployment (ages 15–24) across countries over time.
Ultimate Goal: Understand patterns and relationships between macro indicators and youth unemployment to inform future analysis and visualization (e.g., country benchmarking, regression, forecasting).
Research Questions:
Data source: We use World Bank Open Data, accessed via the WDI R package (no API key required).
Indicators used:
SL.UEM.1524.ZS — Unemployment, youth total (% of total labor force ages 15–24)
NY.GDP.PCAP.CD — GDP per capita (current US$)
FP.CPI.TOTL.ZG — Inflation, consumer prices (annual %)
SE.XPD.TOTL.GD.ZS — Government expenditure on education, total (% of GDP)
# Install if needed:
# install.packages(c("WDI", "dplyr", "tidyr", "janitor", "ggplot2", "zoo"), repos = "https://cloud.r-project.org")
suppressPackageStartupMessages({
library(WDI)
library(dplyr)
library(tidyr)
library(janitor)
library(ggplot2)
library(zoo)
})
# ---- Analysis configuration ----

# ISO2 country codes: USA, UK, Spain, Germany, Japan, Brazil.
countries <- c("US", "GB", "ES", "DE", "JP", "BR")

# Rolling window: the current calendar year plus the 11 years before it
# (~ last 12 years). The year is read once so both bounds always agree.
end_year <- as.integer(format(Sys.Date(), "%Y"))
start_year <- end_year - 11

# Lookup from WDI indicator code to the label used on plot axes.
indicator_names <- c(
  "SL.UEM.1524.ZS" = "Youth Unemployment (%)",
  "NY.GDP.PCAP.CD" = "GDP per Capita (USD)",
  "FP.CPI.TOTL.ZG" = "Inflation (CPI, %)",
  "SE.XPD.TOTL.GD.ZS" = "Education Expenditure (% of GDP)"
)

# Indicator codes to request: youth unemployment (%), GDP per capita (USD),
# inflation (annual %), education spend (% of GDP) -- same order as the lookup.
indicators <- names(indicator_names)
# ---- Download ----
# Request all indicators for all configured countries in a single WDI() call.
# `extra = TRUE` adds country metadata columns (region, income, capital, ...)
# alongside the indicator values; region and income are kept by the cleaning
# step further down.
raw_wdi <- WDI(
  indicator = indicators,
  country = countries,
  start = start_year,
  end = end_year,
  extra = TRUE,
  cache = NULL
)

# Quick sanity check on how many country-year rows came back.
cat(paste("Raw WDI rows:", nrow(raw_wdi), "\n"))
## Raw WDI rows: 66
head(raw_wdi, n = 10)
## country iso2c iso3c year status lastupdated SL.UEM.1524.ZS NY.GDP.PCAP.CD
## 1 Brazil BR BRA 2014 2025-10-07 15.496 12274.994
## 2 Brazil BR BRA 2015 2025-10-07 19.493 8936.197
## 3 Brazil BR BRA 2016 2025-10-07 26.596 8836.287
## 4 Brazil BR BRA 2017 2025-10-07 28.594 10080.509
## 5 Brazil BR BRA 2018 2025-10-07 27.958 9300.662
## 6 Brazil BR BRA 2019 2025-10-07 27.098 9029.833
## 7 Brazil BR BRA 2020 2025-10-07 30.267 7074.194
## 8 Brazil BR BRA 2021 2025-10-07 28.311 7972.537
## 9 Brazil BR BRA 2022 2025-10-07 20.732 9281.333
## 10 Brazil BR BRA 2023 2025-10-07 17.938 10377.589
## FP.CPI.TOTL.ZG SE.XPD.TOTL.GD.ZS region capital
## 1 6.329040 5.94848 Latin America & Caribbean Brasilia
## 2 9.029901 6.24106 Latin America & Caribbean Brasilia
## 3 8.739144 6.31404 Latin America & Caribbean Brasilia
## 4 3.446373 6.32048 Latin America & Caribbean Brasilia
## 5 3.664850 6.08851 Latin America & Caribbean Brasilia
## 6 3.732976 5.96347 Latin America & Caribbean Brasilia
## 7 3.211768 5.77150 Latin America & Caribbean Brasilia
## 8 8.301660 5.49698 Latin America & Caribbean Brasilia
## 9 9.280106 5.61923 Latin America & Caribbean Brasilia
## 10 4.593563 NA Latin America & Caribbean Brasilia
## longitude latitude income lending
## 1 -47.9292 -15.7801 Upper middle income IBRD
## 2 -47.9292 -15.7801 Upper middle income IBRD
## 3 -47.9292 -15.7801 Upper middle income IBRD
## 4 -47.9292 -15.7801 Upper middle income IBRD
## 5 -47.9292 -15.7801 Upper middle income IBRD
## 6 -47.9292 -15.7801 Upper middle income IBRD
## 7 -47.9292 -15.7801 Upper middle income IBRD
## 8 -47.9292 -15.7801 Upper middle income IBRD
## 9 -47.9292 -15.7801 Upper middle income IBRD
## 10 -47.9292 -15.7801 Upper middle income IBRD
# ---- Clean ----
# Normalise column names to snake_case, then keep only the analysis columns,
# giving the indicator columns short readable names in the same step.
# Rows end up ordered by country code and, within a country, by year.
wdi <- raw_wdi %>%
  clean_names() %>%
  select(
    country_code = iso2c,
    country_name = country,
    region,
    income,
    year,
    youth_unemp = sl_uem_1524_zs,
    gdp_pc = ny_gdp_pcap_cd,
    inflation = fp_cpi_totl_zg,
    edu_spend = se_xpd_totl_gd_zs
  ) %>%
  arrange(country_code, year)
# Simple time-series imputation helper.
#
# Fills missing values in one series in three passes:
#   1. linear interpolation between observed neighbours (by position),
#   2. last observation carried forward for trailing gaps,
#   3. next observation carried backward for leading gaps.
#
# x: a vector in time order (one country's values for one indicator).
# Returns a vector of the same length; a series with no observed values
# (including an empty one) is returned untouched.
impute_series <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }

  interpolated <- zoo::na.approx(x, na.rm = FALSE)
  filled_forward <- zoo::na.locf(interpolated, na.rm = FALSE)
  zoo::na.locf(filled_forward, na.rm = FALSE, fromLast = TRUE)
}
# ---- Impute ----
# Fill gaps separately for each country so values never leak across
# countries. Relies on `wdi` already being sorted by year within country,
# because impute_series() works on row position.
wdi_imputed <- wdi %>%
  group_by(country_code) %>%
  mutate(
    across(
      c(youth_unemp, gdp_pc, inflation, edu_spend),
      impute_series
    )
  ) %>%
  ungroup()
# ---- Feature engineering ----
# Year-over-year growth of GDP per capita, in percent, computed within each
# country. The first year of every country has no predecessor, so its growth
# is NA. dplyr::lag is namespaced on purpose: stats::lag has different
# semantics and would silently give the wrong result.
wdi_feat <- wdi_imputed %>%
  arrange(country_code, year) %>%
  group_by(country_code) %>%
  mutate(gdp_pc_growth = (gdp_pc / dplyr::lag(gdp_pc) - 1) * 100) %>%
  ungroup()
head(wdi_feat, n = 10)
## # A tibble: 10 × 10
## country_code country_name region income year youth_unemp gdp_pc inflation
## <chr> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 BR Brazil Latin Am… Upper… 2014 15.5 12275. 6.33
## 2 BR Brazil Latin Am… Upper… 2015 19.5 8936. 9.03
## 3 BR Brazil Latin Am… Upper… 2016 26.6 8836. 8.74
## 4 BR Brazil Latin Am… Upper… 2017 28.6 10081. 3.45
## 5 BR Brazil Latin Am… Upper… 2018 28.0 9301. 3.66
## 6 BR Brazil Latin Am… Upper… 2019 27.1 9030. 3.73
## 7 BR Brazil Latin Am… Upper… 2020 30.3 7074. 3.21
## 8 BR Brazil Latin Am… Upper… 2021 28.3 7973. 8.30
## 9 BR Brazil Latin Am… Upper… 2022 20.7 9281. 9.28
## 10 BR Brazil Latin Am… Upper… 2023 17.9 10378. 4.59
## # ℹ 2 more variables: edu_spend <dbl>, gdp_pc_growth <dbl>
# ---- Descriptive statistics per country ----
# One row per country with the covered year span, number of observations,
# and the mean (plus min/max for youth unemployment) of each indicator.
# Missing values are ignored in every aggregate.
#
# `.groups = "drop"` returns a plain, ungrouped tibble. Without it the result
# stays grouped by country_code (dplyr drops only the last grouping variable),
# which prints a regrouping message and makes any later verb on
# `desc_by_country` silently operate per country.
desc_by_country <- wdi_feat %>%
  group_by(country_code, country_name) %>%
  summarize(
    years = paste0(min(year, na.rm = TRUE), "-", max(year, na.rm = TRUE)),
    obs = dplyr::n(),
    youth_unemp_avg = mean(youth_unemp, na.rm = TRUE),
    youth_unemp_min = min(youth_unemp, na.rm = TRUE),
    youth_unemp_max = max(youth_unemp, na.rm = TRUE),
    gdp_pc_avg = mean(gdp_pc, na.rm = TRUE),
    infl_avg = mean(inflation, na.rm = TRUE),
    edu_spend_avg = mean(edu_spend, na.rm = TRUE),
    gdp_pc_g_avg = mean(gdp_pc_growth, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(country_code)
desc_by_country
## # A tibble: 6 × 11
## country_code country_name years obs youth_unemp_avg youth_unemp_min
## <chr> <chr> <chr> <int> <dbl> <dbl>
## 1 BR Brazil 2014-2024 11 23.7 15.5
## 2 DE Germany 2014-2024 11 6.73 5.86
## 3 ES Spain 2014-2024 11 37.3 27.0
## 4 GB United Kingdom 2014-2024 11 12.8 10.5
## 5 JP Japan 2014-2024 11 4.62 3.67
## 6 US United States 2014-2024 11 10.2 7.95
## # ℹ 5 more variables: youth_unemp_max <dbl>, gdp_pc_avg <dbl>, infl_avg <dbl>,
## # edu_spend_avg <dbl>, gdp_pc_g_avg <dbl>
# ---- Correlations ----
# Pairwise Pearson correlations on pooled country-year rows. Rows with any
# missing value are removed first (this drops each country's first year,
# where gdp_pc_growth is NA).
corr_vars <- wdi_feat %>%
  select(youth_unemp, gdp_pc, gdp_pc_growth, inflation, edu_spend) %>%
  drop_na()
round(cor(corr_vars), digits = 3)
## youth_unemp gdp_pc gdp_pc_growth inflation edu_spend
## youth_unemp 1.000 -0.564 -0.054 0.084 0.198
## gdp_pc -0.564 1.000 0.243 -0.198 -0.056
## gdp_pc_growth -0.054 0.243 1.000 0.130 0.066
## inflation 0.084 -0.198 0.130 1.000 0.522
## edu_spend 0.198 -0.056 0.066 0.522 1.000
# Trend chart: youth unemployment by year, one coloured line (with point
# markers) per country.
wdi_feat %>%
  ggplot(
    aes(x = year, y = youth_unemp, group = country_code, color = country_code)
  ) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Youth Unemployment (15–24) Over Time",
    x = "Year",
    y = indicator_names["SL.UEM.1524.ZS"],
    color = "Country"
  )
# Scatter: youth unemployment against GDP per capita, coloured by country.
# The x axis is log-scaled and labelled in dollars.
wdi_feat %>%
  ggplot(aes(x = gdp_pc, y = youth_unemp, color = country_code)) +
  geom_point(alpha = 0.7) +
  scale_x_log10(labels = scales::dollar) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Youth Unemployment vs GDP per Capita",
    x = indicator_names["NY.GDP.PCAP.CD"],
    y = indicator_names["SL.UEM.1524.ZS"],
    color = "Country"
  )
# Scatter: youth unemployment against government education expenditure
# (% of GDP), coloured by country.
wdi_feat %>%
  ggplot(aes(x = edu_spend, y = youth_unemp, color = country_code)) +
  geom_point(alpha = 0.7) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Youth Unemployment vs Education Expenditure (% of GDP)",
    x = indicator_names["SE.XPD.TOTL.GD.ZS"],
    y = indicator_names["SL.UEM.1524.ZS"],
    color = "Country"
  )
# Scatter: youth unemployment against consumer-price inflation, coloured by
# country.
wdi_feat %>%
  ggplot(aes(x = inflation, y = youth_unemp, color = country_code)) +
  geom_point(alpha = 0.7) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Youth Unemployment vs Inflation",
    x = indicator_names["FP.CPI.TOTL.ZG"],
    y = indicator_names["SL.UEM.1524.ZS"],
    color = "Country"
  )
# write.csv(wdi_feat, "youth_unemployment_macro_clean.csv", row.names = FALSE)