Question: Is access to clean fuels and technologies for cooking an inverse indicator of prevalence of undernourishment?

Import data

# Load some useful libraries.
library(tidyverse) #includes dplyr, tibble, ggplot2, knitr
library(readr)
library(scales)
library(psych)
library(ggthemes)

# Import the data.
wdi <- read_csv("WDICSV.csv")
head(wdi)
## # A tibble: 6 × 69
##   `Country Name`  `Country Code` `Indicator Name` `Indicator Code` `1960` `1961`
##   <chr>           <chr>          <chr>            <chr>             <dbl>  <dbl>
## 1 Africa Eastern… AFE            Access to clean… EG.CFT.ACCS.ZS       NA     NA
## 2 Africa Eastern… AFE            Access to clean… EG.CFT.ACCS.RU.…     NA     NA
## 3 Africa Eastern… AFE            Access to clean… EG.CFT.ACCS.UR.…     NA     NA
## 4 Africa Eastern… AFE            Access to elect… EG.ELC.ACCS.ZS       NA     NA
## 5 Africa Eastern… AFE            Access to elect… EG.ELC.ACCS.RU.…     NA     NA
## 6 Africa Eastern… AFE            Access to elect… EG.ELC.ACCS.UR.…     NA     NA
## # ℹ 63 more variables: `1962` <dbl>, `1963` <dbl>, `1964` <dbl>, `1965` <dbl>,
## #   `1966` <dbl>, `1967` <dbl>, `1968` <dbl>, `1969` <dbl>, `1970` <dbl>,
## #   `1971` <dbl>, `1972` <dbl>, `1973` <dbl>, `1974` <dbl>, `1975` <dbl>,
## #   `1976` <dbl>, `1977` <dbl>, `1978` <dbl>, `1979` <dbl>, `1980` <dbl>,
## #   `1981` <dbl>, `1982` <dbl>, `1983` <dbl>, `1984` <dbl>, `1985` <dbl>,
## #   `1986` <dbl>, `1987` <dbl>, `1988` <dbl>, `1989` <dbl>, `1990` <dbl>,
## #   `1991` <dbl>, `1992` <dbl>, `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, …

Format the data

# Keep the below Indicator Name/Code observations:
# Access to clean fuels and technologies for cooking (% of population) (Indicator Name)
# EG.CFT.ACCS.ZS (Indicator Code)
# Prevalence of undernourishment (% of population) (Indicator Name)
# SN.ITK.DEFC.ZS (Indicator Code)

# Filter by Indicator Code and select Country Name, Country Code, Indicator Name, Indicator Code, and the years 2000-2022
wdi_clean <- wdi %>%
  filter(`Indicator Code` == "EG.CFT.ACCS.ZS" | `Indicator Code` == "SN.ITK.DEFC.ZS") %>%
  select(`Country Name`, `Country Code`, `Indicator Name`, `Indicator Code`, `2000`, `2001`, `2002`, `2003`, `2004`, `2005`, `2006`, `2007`, `2008`, `2009`, `2010`, `2011`, `2012`, `2013`, `2014`, `2015`, `2016`, `2017`, `2018`, `2019`, `2020`, `2021`, `2022`)

# Group by the Indicator Name and summarize to compute the mean values for each year.
wdi_summary <- (wdi_clean %>% 
               group_by(`Indicator Name`) %>% 
               summarize(`2000` = mean(`2000`, na.rm = TRUE), `2001` = mean(`2001`, na.rm = TRUE), `2002` = mean(`2002`, na.rm = TRUE), `2003` = mean(`2003`, na.rm = TRUE), `2004` = mean(`2004`, na.rm = TRUE), `2005` = mean(`2005`, na.rm = TRUE), `2006` = mean(`2006`, na.rm = TRUE), `2007` = mean(`2007`, na.rm = TRUE), `2008` = mean(`2008`, na.rm = TRUE), `2009` = mean(`2009`, na.rm = TRUE), `2010` = mean(`2010`, na.rm = TRUE), `2011` = mean(`2011`, na.rm = TRUE), `2012` = mean(`2012`, na.rm = TRUE), `2013` = mean(`2013`, na.rm = TRUE), `2014` = mean(`2014`, na.rm = TRUE), `2015` = mean(`2015`, na.rm = TRUE), `2016` = mean(`2016`, na.rm = TRUE), `2017` = mean(`2017`, na.rm = TRUE), `2018` = mean(`2018`, na.rm = TRUE), `2019` = mean(`2019`, na.rm = TRUE), `2020` = mean(`2020`, na.rm = TRUE), `2021` = mean(`2021`, na.rm = TRUE), `2022` = mean(`2022`, na.rm = TRUE)))

# Pivot the data from wide to long.
wdi_summary_long <- wdi_summary %>% 
  gather(`2000`:`2022`, key="year", value="mean")
wdi_summary_long$year <- as.numeric(wdi_summary_long$year)

wdi_summary_2 <- wdi_summary_long %>%
  spread("Indicator Name", key = "Indicator Name", value = "mean")
# Reformat the data again to look at correlation.
wdi_summary_2 <- wdi_summary_long %>%
  spread("Indicator Name", key = "Indicator Name", value = "mean")

# Column data needs to be recast as numeric.
wdi_summary_2$`Access to clean fuels and technologies for cooking (% of population)` <- as.numeric(wdi_summary_2$`Access to clean fuels and technologies for cooking (% of population)`)
wdi_summary_2$`Prevalence of undernourishment (% of population)` <- as.numeric(wdi_summary_2$`Prevalence of undernourishment (% of population)`)
wdi_summary_2 <- na.omit(wdi_summary_2)

correlation <- round(cor(wdi_summary_2$`Access to clean fuels and technologies for cooking (% of population)`, wdi_summary_2$`Prevalence of undernourishment (% of population)`), 2)

print(paste("The correlation coefficent between these two variables is", correlation, "This is a very strong negative correlation, as we might expect."))
## [1] "The correlation coefficent between these two variables is -0.85 This is a very strong negative correlation, as we might expect."
wdi_summary_3 <- wdi_summary_2 %>%
  filter(year >= 2010)

correlation2 <- round(cor(wdi_summary_3$`Access to clean fuels and technologies for cooking (% of population)`, wdi_summary_3$`Prevalence of undernourishment (% of population)`), 2)

print(paste("If we restrict our time frame to just 2010-2022, however, the correlation coefficent between these two variables is", correlation2, "This is not what we might hope. Despite a continued increase in access to clean fuels and technologies for cooking since 2010, prevalence of undernourishment is on the rise."))
## [1] "If we restrict our time frame to just 2010-2022, however, the correlation coefficent between these two variables is 0.3 This is not what we might hope. Despite a continued increase in access to clean fuels and technologies for cooking since 2010, prevalence of undernourishment is on the rise."

Create a visual by plotting Access to clean fuels and technologies for cooking (% of population) and Prevalence of undernourishment (% of population) on the same graph.

We can see from the below that, while it seems like it would be helpful to have these on the same graph, this is actually not particularly insightful as the scaling requires the data to lose some nuance.

ggplot(wdi_summary_long, aes(year, mean)) + geom_line(aes(color = `Indicator Name`)) + 
  labs(x="Year", 
       y="Percentage of population", caption="Teresa Brown | Data Source: World Bank Group (2025)") +
  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        axis.line = element_line(color = "#999999"), axis.ticks.x = element_blank(), axis.ticks.y = element_blank())

Create a second visual by plotting each graph side by side.

Although these y-axis scales obviously do not match, this actually seems like a more insightful way to plot this information because the reader is able to see more nuance on each graph.

wdi_cooking <- wdi_summary_long %>%
  filter(`Indicator Name` == "Access to clean fuels and technologies for cooking (% of population)")

wdi_un <- wdi_summary_long %>%
  filter(`Indicator Name` == "Prevalence of undernourishment (% of population)")

plot_cooking <- ggplot(wdi_cooking, aes(year, mean)) + geom_line() + 
  labs(title= "Access for cooking, 2000-2022", x="Year", 
       y="Percentage of population", caption="Teresa Brown | Data Source: World Bank Group (2025)") +
  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        axis.line = element_line(color = "#999999"), axis.ticks.x = element_blank(), axis.ticks.y = element_blank())

plot_un <- ggplot(wdi_un, aes(year, mean)) + geom_line() + 
  labs(title="Undernourishment 2000-2022", x="Year", 
       y="Percentage of population", caption="Teresa Brown | Data Source: World Bank Group (2025)") +
  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        axis.line = element_line(color = "#999999"), axis.ticks.x = element_blank(), axis.ticks.y = element_blank())

library(cowplot)
cowplot::plot_grid(plot_cooking, plot_un, labels = "AUTO")