Data info : World Development Indicators

I used the World Development Indicators (WDI) published by the World Bank, a comprehensive global dataset with about 1,500 economic, social, and environmental indicators for more than 200 countries, covering several decades.

Load the WDI file

# Work from the project folder so that relative file paths resolve there.
# NOTE(review): this hard-coded path is machine-specific; anyone else running
# the analysis has to adjust it (or open the folder as a project instead).
setwd(dir = "~/Documents/Rproject")

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

# Read the raw WDI export with base read.csv(). check.names = TRUE (the
# default) makes the column names syntactic, which is presumably where the
# dotted names (Country.Name) and X-prefixed year columns (X2018) come from.
wdi <- read.csv(file = "WDICSV.csv", check.names = TRUE)

The questions regarding the WDI data were as follows:

  1. How has the world’s population growth rate changed over the past years?
  2. Which countries are experiencing the fastest growth?
# Count the distinct indicator codes; the dataset has 1,516 in total

length(unique(wdi$Indicator.Code))
## [1] 1516
# Population growth is reported under the SP.POP.GROW indicator code.
# Keep only those rows, together with the identifier columns and the
# seven most recent yearly columns (2018-2024).

pop_growth <- wdi[
  which(wdi$Indicator.Code == "SP.POP.GROW"),
  c(
    "Country.Name", "Country.Code", "Indicator.Name", "Indicator.Code",
    paste0("X", 2018:2024)
  )
]

# Confirm that the dataset has 266 distinct country codes
# (these include aggregates such as the European Union, not only countries)

length(unique(pop_growth$Country.Code)) # 266 codes
## [1] 266
# For easier visualization and comparison, keep only the G20 members
# (19 countries plus the European Union aggregate, code EUU)

g20_code <- c(
  "ARG", "AUS", "BRA", "CAN", "CHN", "FRA", "DEU", "IND", "IDN", "ITA",
  "JPN", "MEX", "KOR", "RUS", "SAU", "ZAF", "TUR", "GBR", "USA", "EUU"
)

g20_pop_growth <- filter(pop_growth, Country.Code %in% g20_code)

summary(g20_pop_growth)
##  Country.Name       Country.Code       Indicator.Name     Indicator.Code    
##  Length:20          Length:20          Length:20          Length:20         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      X2018             X2019             X2020             X2021        
##  Min.   :-2.5538   Min.   :-0.7000   Min.   :-0.4871   Min.   :-2.4646  
##  1st Qu.: 0.2863   1st Qu.: 0.2016   1st Qu.: 0.1238   1st Qu.:-0.1326  
##  Median : 0.5374   Median : 0.5149   Median : 0.4465   Median : 0.2116  
##  Mean   : 0.5491   Mean   : 0.5743   Mean   : 0.6939   Mean   : 0.1490  
##  3rd Qu.: 1.0016   3rd Qu.: 0.9726   3rd Qu.: 0.9711   3rd Qu.: 0.5823  
##  Max.   : 1.6824   Max.   : 1.6496   Max.   : 4.8331   Max.   : 1.5406  
##      X2022             X2023             X2024         
##  Min.   :-0.4439   Min.   :-0.4874   Min.   :-0.46708  
##  1st Qu.: 0.1548   1st Qu.: 0.1123   1st Qu.: 0.05258  
##  Median : 0.6483   Median : 0.4012   Median : 0.37565  
##  Mean   : 0.7215   Mean   : 0.8560   Mean   : 0.79069  
##  3rd Qu.: 0.9371   3rd Qu.: 0.9887   3rd Qu.: 0.99881  
##  Max.   : 4.4189   Max.   : 4.6382   Max.   : 4.63120

Insight 1 : The G20 average growth rate was highest in 2023 (about 0.86%) and lowest in 2021 (about 0.15%), during the pandemic period

# Calculate the average growth rate across the G20 members for each year.
# The mean is wrapped in an explicit function because passing extra arguments
# (na.rm = TRUE) through across()'s `...` is deprecated since dplyr 1.1.0.
# all_of() makes the selection fail loudly if a year column is missing.

yrs_mean <- g20_pop_growth %>%
  summarise(
    across(
      all_of(paste0("X", 2018:2024)),
      function(x) mean(x, na.rm = TRUE)
    )
  )

print(yrs_mean)
##       X2018     X2019     X2020     X2021     X2022    X2023     X2024
## 1 0.5490911 0.5742647 0.6938951 0.1489854 0.7215022 0.856019 0.7906855

Insight 2 : Over 2018–2024, Saudi Arabia, Canada, South Africa, Australia, and India have the highest average growth rates, in that order (Saudi Arabia may be an outlier)

# Average each country's growth rate over the seven years 2018-2024,
# then rank the countries from fastest to slowest growth

row_mean <- rowMeans(g20_pop_growth[paste0("X", 2018:2024)], na.rm = TRUE)
count_mean <- arrange(
  data.frame(Country.Name = g20_pop_growth$Country.Name, row_mean = row_mean),
  desc(row_mean)
)

print(count_mean)
##          Country.Name   row_mean
## 1        Saudi Arabia  1.8662063
## 2              Canada  1.7434283
## 3        South Africa  1.4980363
## 4           Australia  1.4421237
## 5               India  0.9282281
## 6             Turkiye  0.8972410
## 7              Mexico  0.8386310
## 8           Indonesia  0.8374770
## 9      United Kingdom  0.6891008
## 10      United States  0.5787066
## 11             Brazil  0.5002460
## 12          Argentina  0.4468614
## 13             France  0.3372744
## 14     European Union  0.1551998
## 15            Germany  0.1468317
## 16              China  0.1299641
## 17        Korea, Rep.  0.1078306
## 18 Russian Federation -0.1740469
## 19              Italy -0.2440225
## 20              Japan -0.3411952

Insight 3 : In 2023, 13 of the 20 G20 members were below the group average, with a minimum growth rate of -0.49%

# Split the G20 members into those at or above and those below the 2023 mean,
# then compare the maximum and minimum 2023 growth rate and the size of each group

grouped <- g20_pop_growth %>%
  mutate(
    pop_group = if_else(
      X2023 >= mean(X2023, na.rm = TRUE),
      "Above Mean",
      "Below Mean"
    )
  ) %>%
  group_by(pop_group) %>%
  summarise(
    max = max(X2023, na.rm = TRUE),
    min = min(X2023, na.rm = TRUE),
    n = n()
  )

print(grouped)
## # A tibble: 2 × 4
##   pop_group    max    min     n
##   <chr>      <dbl>  <dbl> <int>
## 1 Above Mean 4.64   0.872     7
## 2 Below Mean 0.843 -0.487    13

Output : Line Graph using ggplot

# Reshape to long format for plotting: one row per country and year.
# The "X" prefix is stripped from the year column names so that the year
# can be stored as a number.
df_long <- g20_pop_growth %>%
  select(Country.Name, all_of(paste0("X", 2018:2024))) %>%
  pivot_longer(
    cols = -Country.Name,
    names_to = "year",
    names_prefix = "X",
    values_to = "value"
  ) %>%
  mutate(year = as.numeric(year))

# Line chart of the yearly population growth rate, one coloured line per
# G20 member. Line thickness is set with `linewidth`; using `size` for lines
# is deprecated since ggplot2 3.4.0. The former strip.text setting was
# dropped because the plot has no facets, so it had no effect.
ggplot(df_long, aes(x = year, y = value, color = Country.Name)) +
  geom_line(linewidth = 1) +
  labs(
    title = "Population Growth Rate of G20",
    x = "year",
    y = "value",
    color = "countries"
  ) +
  theme(
    # Tilt the year labels so they do not overlap
    axis.text.x = element_text(angle = 45, hjust = 1)
  )