Load required packages
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Load the World Bank Dataset
# Read the World Bank WDI extract. The file marks missing values with "..",
# so that token is parsed as NA on import.
# NOTE(review): absolute, machine-specific path; consider a project-relative
# path so the script can run elsewhere.
world_bank <- read_csv(
  file = "C:/Users/SP KHALID/Downloads/WDI- World Bank Dataset.csv",
  na = ".."
)
## Rows: 1675 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Time Code, Country Name, Country Code, Region, Income Group
## dbl (14): Time, GDP (constant 2015 US$), GDP growth (annual %), GDP (current...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to preview its first rows and column types
world_bank
## # A tibble: 1,675 × 19
## Time `Time Code` `Country Name` `Country Code` Region `Income Group`
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2000 YR2000 Brazil BRA Latin America… Upper middle …
## 2 2000 YR2000 China CHN East Asia & P… Upper middle …
## 3 2000 YR2000 France FRA Europe & Cent… High income
## 4 2000 YR2000 Germany DEU Europe & Cent… High income
## 5 2000 YR2000 India IND South Asia Lower middle …
## 6 2000 YR2000 Indonesia IDN East Asia & P… Upper middle …
## 7 2000 YR2000 Italy ITA Europe & Cent… High income
## 8 2000 YR2000 Japan JPN East Asia & P… High income
## 9 2000 YR2000 Korea, Rep. KOR East Asia & P… High income
## 10 2000 YR2000 Mexico MEX Latin America… Upper middle …
## # ℹ 1,665 more rows
## # ℹ 13 more variables: `GDP (constant 2015 US$)` <dbl>,
## # `GDP growth (annual %)` <dbl>, `GDP (current US$)` <dbl>,
## # `Unemployment, total (% of total labor force)` <dbl>,
## # `Inflation, consumer prices (annual %)` <dbl>, `Labor force, total` <dbl>,
## # `Population, total` <dbl>,
## # `Exports of goods and services (% of GDP)` <dbl>, …
# Number of rows and columns in the dataset
dim(world_bank)
## [1] 1675 19
# Inspect each column's type alongside a preview of its values
world_bank |>
  glimpse()
## Rows: 1,675
## Columns: 19
## $ Time <dbl> 2000, 20…
## $ `Time Code` <chr> "YR2000"…
## $ `Country Name` <chr> "Brazil"…
## $ `Country Code` <chr> "BRA", "…
## $ Region <chr> "Latin A…
## $ `Income Group` <chr> "Upper m…
## $ `GDP (constant 2015 US$)` <dbl> 1.18642e…
## $ `GDP growth (annual %)` <dbl> 4.387949…
## $ `GDP (current US$)` <dbl> 6.554482…
## $ `Unemployment, total (% of total labor force)` <dbl> NA, 3.70…
## $ `Inflation, consumer prices (annual %)` <dbl> 7.044141…
## $ `Labor force, total` <dbl> 80295093…
## $ `Population, total` <dbl> 17401828…
## $ `Exports of goods and services (% of GDP)` <dbl> 10.18805…
## $ `Imports of goods and services (% of GDP)` <dbl> 12.45171…
## $ `General government final consumption expenditure (% of GDP)` <dbl> 18.76784…
## $ `Foreign direct investment, net inflows (% of GDP)` <dbl> 5.033917…
## $ `Gross savings (% of GDP)` <dbl> 13.99170…
## $ `Current account balance (% of GDP)` <dbl> -4.04774…
# Store the year as an integer instead of a double
world_bank[["Time"]] <- as.integer(world_bank[["Time"]])

# Confirm the resulting column types
world_bank |>
  glimpse()
## Rows: 1,675
## Columns: 19
## $ Time <int> 2000, 20…
## $ `Time Code` <chr> "YR2000"…
## $ `Country Name` <chr> "Brazil"…
## $ `Country Code` <chr> "BRA", "…
## $ Region <chr> "Latin A…
## $ `Income Group` <chr> "Upper m…
## $ `GDP (constant 2015 US$)` <dbl> 1.18642e…
## $ `GDP growth (annual %)` <dbl> 4.387949…
## $ `GDP (current US$)` <dbl> 6.554482…
## $ `Unemployment, total (% of total labor force)` <dbl> NA, 3.70…
## $ `Inflation, consumer prices (annual %)` <dbl> 7.044141…
## $ `Labor force, total` <dbl> 80295093…
## $ `Population, total` <dbl> 17401828…
## $ `Exports of goods and services (% of GDP)` <dbl> 10.18805…
## $ `Imports of goods and services (% of GDP)` <dbl> 12.45171…
## $ `General government final consumption expenditure (% of GDP)` <dbl> 18.76784…
## $ `Foreign direct investment, net inflows (% of GDP)` <dbl> 5.033917…
## $ `Gross savings (% of GDP)` <dbl> 13.99170…
## $ `Current account balance (% of GDP)` <dbl> -4.04774…
# Summary statistics (quartiles, mean, NA count) for every numeric column
summary(select(world_bank, where(is.numeric)))
## Time GDP (constant 2015 US$) GDP growth (annual %)
## Min. :2000 Min. :1.649e+09 Min. :-30.145
## 1st Qu.:2006 1st Qu.:3.376e+10 1st Qu.: 1.888
## Median :2012 Median :2.160e+11 Median : 3.950
## Mean :2012 Mean :9.444e+11 Mean : 3.899
## 3rd Qu.:2018 3rd Qu.:7.585e+11 3rd Qu.: 6.050
## Max. :2024 Max. :2.257e+13 Max. : 34.466
## NA's :1 NA's :2
## GDP (current US$) Unemployment, total (% of total labor force)
## Min. :6.359e+08 Min. : 0.200
## 1st Qu.:2.468e+10 1st Qu.: 3.741
## Median :1.915e+11 Median : 5.600
## Mean :9.409e+11 Mean : 6.826
## 3rd Qu.:7.091e+11 3rd Qu.: 8.711
## Max. :2.875e+13 Max. :34.007
## NA's :1 NA's :414
## Inflation, consumer prices (annual %) Labor force, total Population, total
## Min. : -8.975 Min. : 1277880 Min. :2.402e+06
## 1st Qu.: 1.851 1st Qu.: 4932834 1st Qu.:1.109e+07
## Median : 3.826 Median : 11764216 Median :2.930e+07
## Mean : 6.367 Mean : 41128373 Mean :8.904e+07
## 3rd Qu.: 7.849 3rd Qu.: 30480443 3rd Qu.:7.070e+07
## Max. :219.884 Max. :781187865 Max. :1.451e+09
## NA's :61
## Exports of goods and services (% of GDP)
## Min. : 3.401
## 1st Qu.: 18.479
## Median : 27.923
## Mean : 33.755
## 3rd Qu.: 39.495
## Max. :228.994
## NA's :82
## Imports of goods and services (% of GDP)
## Min. : 9.099
## 1st Qu.: 23.173
## Median : 29.879
## Mean : 35.846
## 3rd Qu.: 39.961
## Max. :208.333
## NA's :82
## General government final consumption expenditure (% of GDP)
## Min. : 4.766
## 1st Qu.:11.162
## Median :14.756
## Mean :14.919
## 3rd Qu.:18.543
## Max. :28.915
## NA's :100
## Foreign direct investment, net inflows (% of GDP) Gross savings (% of GDP)
## Min. :-32.151 Min. :-9.492
## 1st Qu.: 1.014 1st Qu.:17.092
## Median : 2.348 Median :22.641
## Mean : 3.831 Mean :23.644
## 3rd Qu.: 4.051 3rd Qu.:29.646
## Max. :103.337 Max. :52.752
## NA's :1 NA's :192
## Current account balance (% of GDP)
## Min. :-59.996
## 1st Qu.: -4.557
## Median : -1.595
## Mean : -1.336
## 3rd Qu.: 2.151
## Max. : 33.679
## NA's :126
# dplyr is already attached by library(tidyverse) earlier in the script,
# so this call is a no-op there; it is kept so the chunk also runs alone.
library(dplyr)

# Count the distinct values in every categorical (character/factor) column
world_bank |>
  summarise(
    across(where(\(col) is.character(col) || is.factor(col)), n_distinct)
  )
## # A tibble: 1 × 5
## `Time Code` `Country Name` `Country Code` Region `Income Group`
## <int> <int> <int> <int> <int>
## 1 25 67 67 7 4
# Keep one row per country (its most recent year), then tally how many
# countries fall into each income group.
income_group_summary <- world_bank |>
  slice_max(Time, n = 1, with_ties = FALSE, by = `Country Code`) |>
  count(`Income Group`, name = "No of Countries")

income_group_summary
## # A tibble: 4 × 2
## `Income Group` `No of Countries`
## <chr> <int>
## 1 High income 24
## 2 Low income 12
## 3 Lower middle income 15
## 4 Upper middle income 16
# Keep one row per country (its most recent year), then tally how many
# countries belong to each region.
region_summary <- world_bank |>
  slice_max(Time, n = 1, with_ties = FALSE, by = `Country Code`) |>
  count(Region, name = "No of Countries")

region_summary
## # A tibble: 7 × 2
## Region `No of Countries`
## <chr> <int>
## 1 East Asia & Pacific 11
## 2 Europe & Central Asia 18
## 3 Latin America & Caribbean 9
## 4 Middle East & North Africa 8
## 5 North America 2
## 6 South Asia 4
## 7 Sub-Saharan Africa 15
Larger economies such as the US and China have shown considerably higher GDP (constant 2015 US$) over the years, due to greater capital, technology, and productivity advantages.
# Five most populous countries (by mean population over all years), with
# their mean constant-2015 GDP in trillions of US$ and mean annual GDP growth.
# The summary columns are created directly in their final display order.
q1 <- world_bank |>
  group_by(`Country Name`) |>
  summarise(
    Avg_Population = mean(`Population, total`, na.rm = TRUE),
    Avg_GDP_Constant_2015_Trillion_USD =
      mean(`GDP (constant 2015 US$)`, na.rm = TRUE) / 1e12,
    Avg_GDP_Growth = mean(`GDP growth (annual %)`, na.rm = TRUE)
  ) |>
  arrange(desc(Avg_Population)) |>
  slice_head(n = 5)

q1
## # A tibble: 5 × 4
## `Country Name` Avg_Population Avg_GDP_Constant_2015_Trillion_…¹ Avg_GDP_Growth
## <chr> <dbl> <dbl> <dbl>
## 1 China 1351585200 9.59 8.19
## 2 India 1270238266. 1.86 6.26
## 3 United States 312853941. 17.5 2.21
## 4 Indonesia 251600086. 0.762 4.89
## 5 Pakistan 204426074. 0.280 3.97
## # ℹ abbreviated name: ¹​Avg_GDP_Constant_2015_Trillion_USD
For instance, if we look at the yearly data for the US, we can see that GDP (measured in current US$ in the table below) rises almost steadily from year to year.
# United States GDP in current US$ (trillions) for each year, in
# chronological order.
q2 <- world_bank |>
  filter(`Country Code` == "USA") |>
  summarise(
    Avg_GDP_Trillion_USD = mean(`GDP (current US$)`, na.rm = TRUE) / 1e12,
    .by = Time
  ) |>
  arrange(Time)

q2
## # A tibble: 25 × 2
## Time Avg_GDP_Trillion_USD
## <int> <dbl>
## 1 2000 10.3
## 2 2001 10.6
## 3 2002 10.9
## 4 2003 11.5
## 5 2004 12.2
## 6 2005 13.0
## 7 2006 13.8
## 8 2007 14.5
## 9 2008 14.8
## 10 2009 14.5
## # ℹ 15 more rows
The average export shares are relatively comparable across the groups, especially between the upper and lower middle income groups. One of the reasons is the manufacturing potential of countries such as Singapore, China and Vietnam, together with their cheap labor costs.
# Mean export share of GDP per income group, pooled over all countries
# and years (missing values ignored).
q3 <- world_bank |>
  group_by(`Income Group`) |>
  summarise(
    Avg_Exports_Percent_GDP = mean(
      `Exports of goods and services (% of GDP)`,
      na.rm = TRUE
    ),
    .groups = "drop"
  )

q3
## # A tibble: 4 × 2
## `Income Group` Avg_Exports_Percent_GDP
## <chr> <dbl>
## 1 High income 44.4
## 2 Low income 19.1
## 3 Lower middle income 24.6
## 4 Upper middle income 34.5
# Per-country averages over all available years, used for the bubble chart:
# mean annual GDP growth, mean constant-2015 GDP and mean population.
scatter_data <- world_bank |>
  group_by(`Country Name`, `Income Group`) |>
  summarise(
    Avg_GDP_Growth = mean(`GDP growth (annual %)`, na.rm = TRUE),
    GDP_Constant_2015 = mean(`GDP (constant 2015 US$)`, na.rm = TRUE),
    Population = mean(`Population, total`, na.rm = TRUE),
    .groups = "drop"
  )

# Bubble chart: GDP level against average growth, coloured by income group
# and sized by population.
ggplot(
  scatter_data,
  aes(
    x = Avg_GDP_Growth,
    y = GDP_Constant_2015,
    color = `Income Group`,
    size = Population
  )
) +
  geom_point(alpha = 0.6) +
  # GDP levels span several orders of magnitude (roughly 1e9 to 1e13), so a
  # linear axis squeezes almost every country against zero. A log10 axis
  # keeps small and large economies distinguishable.
  scale_y_log10() +
  labs(
    title = "GDP Level vs Average GDP Growth",
    x = "Average GDP Growth (Annual %)",
    y = "GDP (Constant 2015 US$, log scale)",
    size = "Population",
    color = "Income Group"
  ) +
  theme_minimal()
However, even with its high GDP (constant 2015 US$), the annual GDP growth of the US has fluctuated over the years due to business cycles, shifts in consumer demand, and changes in monetary and fiscal policy.
# Yearly GDP growth for the United States, ordered by year
us_gdp_growth <- world_bank |>
  filter(`Country Code` == "USA") |>
  arrange(Time) |>
  select(Time, `GDP growth (annual %)`)

# Line chart with a point marker on every yearly observation
ggplot(us_gdp_growth, aes(Time, `GDP growth (annual %)`)) +
  geom_line(linewidth = 1) +
  geom_point(alpha = 0.7) +
  labs(
    title = "United States: GDP Growth Rate Over Time",
    x = "Year",
    y = "GDP Growth (Annual %)"
  ) +
  theme_minimal()
# Mean export share of GDP per income group, using 2024 observations only
exports_2024 <- world_bank |>
  filter(Time == 2024) |>
  group_by(`Income Group`) |>
  summarise(
    Avg_Exports_Percent_GDP = mean(
      `Exports of goods and services (% of GDP)`,
      na.rm = TRUE
    ),
    .groups = "drop"
  )

# Bar chart of the 2024 averages.
ggplot(
  exports_2024,
  aes(
    x = `Income Group`,
    y = Avg_Exports_Percent_GDP,
    fill = `Income Group`
  )
) +
  # The x axis already names each group, so the fill legend would only
  # duplicate it.
  geom_col(alpha = 0.8, show.legend = FALSE) +
  # Income groups are ordinal: show them from lowest to highest income
  # instead of the default alphabetical order.
  scale_x_discrete(
    limits = c(
      "Low income",
      "Lower middle income",
      "Upper middle income",
      "High income"
    )
  ) +
  labs(
    title = "Exports of Goods and Services by Income Group (2024)",
    x = "Income Group",
    y = "Average Exports (% of GDP)"
  ) +
  theme_minimal()