Data Dive

Week 2

Load required packages

library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load the World Bank Dataset

# Read the World Bank extract, treating the ".." placeholder as a missing
# value (NA) while parsing.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine where this is run.
world_bank <- read_csv(
  file = "C:/Users/SP KHALID/Downloads/WDI- World Bank Dataset.csv",
  na = ".."
)
## Rows: 1675 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Time Code, Country Name, Country Code, Region, Income Group
## dbl (14): Time, GDP (constant 2015 US$), GDP growth (annual %), GDP (current...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to preview its first rows and column types.
world_bank
## # A tibble: 1,675 × 19
##     Time `Time Code` `Country Name` `Country Code` Region         `Income Group`
##    <dbl> <chr>       <chr>          <chr>          <chr>          <chr>         
##  1  2000 YR2000      Brazil         BRA            Latin America… Upper middle …
##  2  2000 YR2000      China          CHN            East Asia & P… Upper middle …
##  3  2000 YR2000      France         FRA            Europe & Cent… High income   
##  4  2000 YR2000      Germany        DEU            Europe & Cent… High income   
##  5  2000 YR2000      India          IND            South Asia     Lower middle …
##  6  2000 YR2000      Indonesia      IDN            East Asia & P… Upper middle …
##  7  2000 YR2000      Italy          ITA            Europe & Cent… High income   
##  8  2000 YR2000      Japan          JPN            East Asia & P… High income   
##  9  2000 YR2000      Korea, Rep.    KOR            East Asia & P… High income   
## 10  2000 YR2000      Mexico         MEX            Latin America… Upper middle …
## # ℹ 1,665 more rows
## # ℹ 13 more variables: `GDP (constant 2015 US$)` <dbl>,
## #   `GDP growth (annual %)` <dbl>, `GDP (current US$)` <dbl>,
## #   `Unemployment, total (% of total labor force)` <dbl>,
## #   `Inflation, consumer prices (annual %)` <dbl>, `Labor force, total` <dbl>,
## #   `Population, total` <dbl>,
## #   `Exports of goods and services (% of GDP)` <dbl>, …
# Report the number of rows and columns.
world_bank |> dim()
## [1] 1675   19
# Inspect each column's storage type alongside a few sample values
world_bank |> glimpse()
## Rows: 1,675
## Columns: 19
## $ Time                                                          <dbl> 2000, 20…
## $ `Time Code`                                                   <chr> "YR2000"…
## $ `Country Name`                                                <chr> "Brazil"…
## $ `Country Code`                                                <chr> "BRA", "…
## $ Region                                                        <chr> "Latin A…
## $ `Income Group`                                                <chr> "Upper m…
## $ `GDP (constant 2015 US$)`                                     <dbl> 1.18642e…
## $ `GDP growth (annual %)`                                       <dbl> 4.387949…
## $ `GDP (current US$)`                                           <dbl> 6.554482…
## $ `Unemployment, total (% of total labor force)`                <dbl> NA, 3.70…
## $ `Inflation, consumer prices (annual %)`                       <dbl> 7.044141…
## $ `Labor force, total`                                          <dbl> 80295093…
## $ `Population, total`                                           <dbl> 17401828…
## $ `Exports of goods and services (% of GDP)`                    <dbl> 10.18805…
## $ `Imports of goods and services (% of GDP)`                    <dbl> 12.45171…
## $ `General government final consumption expenditure (% of GDP)` <dbl> 18.76784…
## $ `Foreign direct investment, net inflows (% of GDP)`           <dbl> 5.033917…
## $ `Gross savings (% of GDP)`                                    <dbl> 13.99170…
## $ `Current account balance (% of GDP)`                          <dbl> -4.04774…
# Store Time as an integer column instead of a double
world_bank[["Time"]] <- as.integer(world_bank[["Time"]])
# Re-inspect the column types to confirm that Time is now an integer
world_bank |> glimpse()
## Rows: 1,675
## Columns: 19
## $ Time                                                          <int> 2000, 20…
## $ `Time Code`                                                   <chr> "YR2000"…
## $ `Country Name`                                                <chr> "Brazil"…
## $ `Country Code`                                                <chr> "BRA", "…
## $ Region                                                        <chr> "Latin A…
## $ `Income Group`                                                <chr> "Upper m…
## $ `GDP (constant 2015 US$)`                                     <dbl> 1.18642e…
## $ `GDP growth (annual %)`                                       <dbl> 4.387949…
## $ `GDP (current US$)`                                           <dbl> 6.554482…
## $ `Unemployment, total (% of total labor force)`                <dbl> NA, 3.70…
## $ `Inflation, consumer prices (annual %)`                       <dbl> 7.044141…
## $ `Labor force, total`                                          <dbl> 80295093…
## $ `Population, total`                                           <dbl> 17401828…
## $ `Exports of goods and services (% of GDP)`                    <dbl> 10.18805…
## $ `Imports of goods and services (% of GDP)`                    <dbl> 12.45171…
## $ `General government final consumption expenditure (% of GDP)` <dbl> 18.76784…
## $ `Foreign direct investment, net inflows (% of GDP)`           <dbl> 5.033917…
## $ `Gross savings (% of GDP)`                                    <dbl> 13.99170…
## $ `Current account balance (% of GDP)`                          <dbl> -4.04774…

Summary of all numerical columns

# Quartiles, mean, and NA count for every numeric column.
world_bank |>
  select(where(\(col) is.numeric(col))) |>
  summary()
##       Time      GDP (constant 2015 US$) GDP growth (annual %)
##  Min.   :2000   Min.   :1.649e+09       Min.   :-30.145      
##  1st Qu.:2006   1st Qu.:3.376e+10       1st Qu.:  1.888      
##  Median :2012   Median :2.160e+11       Median :  3.950      
##  Mean   :2012   Mean   :9.444e+11       Mean   :  3.899      
##  3rd Qu.:2018   3rd Qu.:7.585e+11       3rd Qu.:  6.050      
##  Max.   :2024   Max.   :2.257e+13       Max.   : 34.466      
##                 NA's   :1               NA's   :2            
##  GDP (current US$)   Unemployment, total (% of total labor force)
##  Min.   :6.359e+08   Min.   : 0.200                              
##  1st Qu.:2.468e+10   1st Qu.: 3.741                              
##  Median :1.915e+11   Median : 5.600                              
##  Mean   :9.409e+11   Mean   : 6.826                              
##  3rd Qu.:7.091e+11   3rd Qu.: 8.711                              
##  Max.   :2.875e+13   Max.   :34.007                              
##  NA's   :1           NA's   :414                                 
##  Inflation, consumer prices (annual %) Labor force, total  Population, total  
##  Min.   : -8.975                       Min.   :  1277880   Min.   :2.402e+06  
##  1st Qu.:  1.851                       1st Qu.:  4932834   1st Qu.:1.109e+07  
##  Median :  3.826                       Median : 11764216   Median :2.930e+07  
##  Mean   :  6.367                       Mean   : 41128373   Mean   :8.904e+07  
##  3rd Qu.:  7.849                       3rd Qu.: 30480443   3rd Qu.:7.070e+07  
##  Max.   :219.884                       Max.   :781187865   Max.   :1.451e+09  
##  NA's   :61                                                                   
##  Exports of goods and services (% of GDP)
##  Min.   :  3.401                         
##  1st Qu.: 18.479                         
##  Median : 27.923                         
##  Mean   : 33.755                         
##  3rd Qu.: 39.495                         
##  Max.   :228.994                         
##  NA's   :82                              
##  Imports of goods and services (% of GDP)
##  Min.   :  9.099                         
##  1st Qu.: 23.173                         
##  Median : 29.879                         
##  Mean   : 35.846                         
##  3rd Qu.: 39.961                         
##  Max.   :208.333                         
##  NA's   :82                              
##  General government final consumption expenditure (% of GDP)
##  Min.   : 4.766                                             
##  1st Qu.:11.162                                             
##  Median :14.756                                             
##  Mean   :14.919                                             
##  3rd Qu.:18.543                                             
##  Max.   :28.915                                             
##  NA's   :100                                                
##  Foreign direct investment, net inflows (% of GDP) Gross savings (% of GDP)
##  Min.   :-32.151                                   Min.   :-9.492          
##  1st Qu.:  1.014                                   1st Qu.:17.092          
##  Median :  2.348                                   Median :22.641          
##  Mean   :  3.831                                   Mean   :23.644          
##  3rd Qu.:  4.051                                   3rd Qu.:29.646          
##  Max.   :103.337                                   Max.   :52.752          
##  NA's   :1                                         NA's   :192             
##  Current account balance (% of GDP)
##  Min.   :-59.996                   
##  1st Qu.: -4.557                   
##  Median : -1.595                   
##  Mean   : -1.336                   
##  3rd Qu.:  2.151                   
##  Max.   : 33.679                   
##  NA's   :126

Summary of categorical columns

library(dplyr)

# Count the distinct values in every character or factor column.
world_bank |>
  summarise(
    across(
      where(\(col) is.character(col) || is.factor(col)),
      n_distinct
    )
  )
## # A tibble: 1 × 5
##   `Time Code` `Country Name` `Country Code` Region `Income Group`
##         <int>          <int>          <int>  <int>          <int>
## 1          25             67             67      7              4

Number of Countries in each Income Group

# Keep a single row per country (its most recent year) so each country is
# counted once, then tally countries by income group.
income_group_summary <- world_bank |>
  slice_max(Time, n = 1, with_ties = FALSE, by = `Country Code`) |>
  count(`Income Group`, name = "No of Countries")
income_group_summary
## # A tibble: 4 × 2
##   `Income Group`      `No of Countries`
##   <chr>                           <int>
## 1 High income                        24
## 2 Low income                         12
## 3 Lower middle income                15
## 4 Upper middle income                16

Number of Countries in each Region

 # Keep a single row per country (its most recent year) so each country is
 # counted once, then tally countries by region.
 region_summary <- world_bank |>
   slice_max(Time, n = 1, with_ties = FALSE, by = `Country Code`) |>
   count(Region, name = "No of Countries")
 region_summary
## # A tibble: 7 × 2
##   Region                     `No of Countries`
##   <chr>                                  <int>
## 1 East Asia & Pacific                       11
## 2 Europe & Central Asia                     18
## 3 Latin America & Caribbean                  9
## 4 Middle East & North Africa                 8
## 5 North America                              2
## 6 South Asia                                 4
## 7 Sub-Saharan Africa                        15

Questions

Q1. Which 5 countries have the largest population, and what are their average GDP (constant 2015) and GDP growth?

Larger economies such as the US and China have shown considerably higher constant-price GDP values over the years, owing to greater capital, technology, and productivity advantages.

# For each country, average population, real GDP (in trillions of constant
# 2015 US$), and annual GDP growth over all of its rows, then keep the five
# countries with the largest average population.
q1 <- world_bank |>
  group_by(`Country Name`) |>
  summarise(
    # Columns are created in their final display order.
    Avg_Population = mean(`Population, total`, na.rm = TRUE),
    Avg_GDP_Constant_2015_Trillion_USD =
      mean(`GDP (constant 2015 US$)`, na.rm = TRUE) / 1e12,
    Avg_GDP_Growth = mean(`GDP growth (annual %)`, na.rm = TRUE)
  ) |>
  arrange(desc(Avg_Population)) |>
  slice_head(n = 5)
q1
## # A tibble: 5 × 4
##   `Country Name` Avg_Population Avg_GDP_Constant_2015_Trillion_…¹ Avg_GDP_Growth
##   <chr>                   <dbl>                             <dbl>          <dbl>
## 1 China             1351585200                              9.59            8.19
## 2 India             1270238266.                             1.86            6.26
## 3 United States      312853941.                            17.5             2.21
## 4 Indonesia          251600086.                             0.762           4.89
## 5 Pakistan           204426074.                             0.280           3.97
## # ℹ abbreviated name: ¹​Avg_GDP_Constant_2015_Trillion_USD

Q2 How has the United States’ GDP (current US$) changed over time?

For instance, if we look at the year-by-year data for the US, we can see that GDP (current US$) increases almost in direct proportion with the year.

# US GDP in trillions of current US$, one row per year in chronological order.
q2 <- world_bank |>
  filter(`Country Code` == "USA") |>
  summarise(
    Avg_GDP_Trillion_USD = mean(`GDP (current US$)`, na.rm = TRUE) / 1e12,
    .by = Time
  ) |>
  arrange(Time)
q2
## # A tibble: 25 × 2
##     Time Avg_GDP_Trillion_USD
##    <int>                <dbl>
##  1  2000                 10.3
##  2  2001                 10.6
##  3  2002                 10.9
##  4  2003                 11.5
##  5  2004                 12.2
##  6  2005                 13.0
##  7  2006                 13.8
##  8  2007                 14.5
##  9  2008                 14.8
## 10  2009                 14.5
## # ℹ 15 more rows

Q3 What is the average exports of goods and services (% of GDP) for different income groups?

The average export values for all of the groups are relatively comparable, especially for the upper and lower middle income groups. One of the reasons is the manufacturing potential of countries such as Singapore, China, and Vietnam, together with their low labor costs.

# Mean exports share of GDP per income group, pooled over all countries and
# years; missing values are ignored.
q3 <- world_bank |>
  group_by(`Income Group`) |>
  summarise(
    Avg_Exports_Percent_GDP = mean(
      `Exports of goods and services (% of GDP)`,
      na.rm = TRUE
    ),
    .groups = "drop"
  )
q3
## # A tibble: 4 × 2
##   `Income Group`      Avg_Exports_Percent_GDP
##   <chr>                                 <dbl>
## 1 High income                            44.4
## 2 Low income                             19.1
## 3 Lower middle income                    24.6
## 4 Upper middle income                    34.5

Visualization

Scatter plot : Average GDP constant vs Average annual GDP growth %

# Collapse the data to one row per country / income-group pair, averaging
# annual GDP growth, real GDP (constant 2015 US$), and population over all
# available years (missing values are ignored).
scatter_data <- world_bank |>
  group_by(`Country Name`, `Income Group`) |>
  summarise(
    Avg_GDP_Growth = mean(`GDP growth (annual %)`, na.rm = TRUE),
    GDP_Constant_2015 = mean(`GDP (constant 2015 US$)`, na.rm = TRUE),
    Population = mean(`Population, total`, na.rm = TRUE),
    .groups = "drop"
  )

# Scatter of average GDP level against average growth; point size encodes
# average population and colour encodes income group.
ggplot(
  scatter_data,
  aes(
    x = Avg_GDP_Growth,
    y = GDP_Constant_2015,
    color = `Income Group`,
    size = Population
  )
) +
  geom_point(alpha = 0.6) +
  # GDP ranges over several orders of magnitude across countries; a log axis
  # keeps the smaller economies from collapsing onto the baseline.
  scale_y_log10() +
  labs(
    title = "Average GDP Level vs Average GDP Growth",
    x = "Average GDP Growth (Annual %)",
    # Labels state explicitly that the plotted values are multi-year averages.
    y = "Average GDP (Constant 2015 US$, log scale)",
    size = "Average Population",
    color = "Income Group"
  ) +
  theme_minimal()

Line Plot : Average annual GDP growth % for US over the years

However, even with higher constant-price GDP values, the average annual GDP growth of the US has fluctuated over the years due to business cycles, shifts in consumer demand, and changes in monetary and fiscal policy.

# Extract the US annual GDP growth series in chronological order
us_gdp_growth <- world_bank |>
  filter(`Country Code` == "USA") |>
  arrange(Time) |>
  select(Time, `GDP growth (annual %)`)

# Line chart of the series, with a marker on every yearly observation
ggplot(
  data = us_gdp_growth,
  mapping = aes(x = Time, y = `GDP growth (annual %)`)
) +
  geom_line(linewidth = 1) +
  geom_point(alpha = 0.7) +
  labs(
    title = "United States: GDP Growth Rate Over Time",
    x = "Year",
    y = "GDP Growth (Annual %)"
  ) +
  theme_minimal()

Bar Chart : Export of Goods and services for different income groups in 2024

# Average exports share of GDP per income group, restricted to 2024 rows
exports_2024 <- world_bank |>
  filter(Time == 2024) |>
  group_by(`Income Group`) |>
  summarise(
    Avg_Exports_Percent_GDP = mean(
      `Exports of goods and services (% of GDP)`,
      na.rm = TRUE
    ),
    .groups = "drop"
  )

# One bar per income group, filled by the same grouping variable
ggplot(
  data = exports_2024,
  mapping = aes(
    x = `Income Group`,
    y = Avg_Exports_Percent_GDP,
    fill = `Income Group`
  )
) +
  geom_col(alpha = 0.8) +
  labs(
    title = "Exports of Goods and Services by Income Group (2024)",
    x = "Income Group",
    y = "Average Exports (% of GDP)"
  ) +
  theme_minimal()