Data reading

We will look at two data sets that we have already encountered:

## WDI data
my_indicators <- c(
  life_expectancy = "SP.DYN.LE00.IN", 
  gdp_capita ="NY.GDP.PCAP.CD", 
  pop = "SP.POP.TOTL"
  )

ind0 <- wb_data(my_indicators, start_date = 2018) 


ind <- ind0 %>% 
  left_join(wb_countries() %>% select(-country), by="iso3c") 

Next, the data for indicator

# Getting the data for an indicator with goalie
pov=sdg_data("SI_POV_DAY1")

In order to facilitate the work if you are working away from the computer room, I have saved the data within the project and you can recover it automatically using:

load("data.RData")

Remember that we can chain operators with the afterwards operator %>%. In RStudio you get with CTRL+SHIFT+M.

Direct manipulation

filter

pov %>% filter(GeoAreaName=="Africa")
## # A tibble: 12 x 20
##     Goal Target Indicator SeriesCode SeriesDescripti~ GeoAreaCode GeoAreaName
##    <dbl>  <dbl> <chr>     <chr>      <chr>                  <dbl> <chr>      
##  1     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  2     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  3     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  4     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  5     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  6     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  7     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  8     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
##  9     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
## 10     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
## 11     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
## 12     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~           2 Africa     
## # ... with 13 more variables: TimePeriod <dbl>, Value <dbl>, Time_Detail <dbl>,
## #   TimeCoverage <lgl>, UpperBound <lgl>, LowerBound <lgl>, BasePeriod <lgl>,
## #   Source <chr>, GeoInfoUrl <lgl>, FootNote <chr>, Nature <chr>,
## #   Reporting_Type <chr>, Units <chr>

select

pov %>% filter(GeoAreaName=="Africa") %>% 
  select(GeoAreaName,TimePeriod,Value)
## # A tibble: 12 x 3
##    GeoAreaName TimePeriod Value
##    <chr>            <dbl> <dbl>
##  1 Africa            1990    46
##  2 Africa            1993    50
##  3 Africa            1996    49
##  4 Africa            1999    49
##  5 Africa            2002    47
##  6 Africa            2005    43
##  7 Africa            2008    41
##  8 Africa            2010    39
##  9 Africa            2011    38
## 10 Africa            2012    37
## 11 Africa            2013    37
## 12 Africa            2015    36

Note that you can also select and rename in one shot

pov %>% filter(GeoAreaName=="Africa") %>% 
  select(Area=GeoAreaName,Year=TimePeriod,Poverty=Value)
## # A tibble: 12 x 3
##    Area    Year Poverty
##    <chr>  <dbl>   <dbl>
##  1 Africa  1990      46
##  2 Africa  1993      50
##  3 Africa  1996      49
##  4 Africa  1999      49
##  5 Africa  2002      47
##  6 Africa  2005      43
##  7 Africa  2008      41
##  8 Africa  2010      39
##  9 Africa  2011      38
## 10 Africa  2012      37
## 11 Africa  2013      37
## 12 Africa  2015      36

rename

You can also do the same with rename

pov %>% filter(GeoAreaName=="Africa") %>% 
  select(GeoAreaName,TimePeriod,Value) %>% 
  rename(Area=GeoAreaName,Year=TimePeriod,Poverty=Value)
## # A tibble: 12 x 3
##    Area    Year Poverty
##    <chr>  <dbl>   <dbl>
##  1 Africa  1990      46
##  2 Africa  1993      50
##  3 Africa  1996      49
##  4 Africa  1999      49
##  5 Africa  2002      47
##  6 Africa  2005      43
##  7 Africa  2008      41
##  8 Africa  2010      39
##  9 Africa  2011      38
## 10 Africa  2012      37
## 11 Africa  2013      37
## 12 Africa  2015      36

arrange

pov %>% filter(TimePeriod=="2000") %>% 
  arrange(-Value) %>% 
  head()
## # A tibble: 6 x 20
##    Goal Target Indicator SeriesCode SeriesDescripti~ GeoAreaCode GeoAreaName
##   <dbl>  <dbl> <chr>     <chr>      <chr>                  <dbl> <chr>      
## 1     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~         834 United Rep~
## 2     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~         646 Rwanda     
## 3     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~         860 Uzbekistan 
## 4     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~         748 Eswatini   
## 5     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~         417 Kyrgyzstan 
## 6     1    1.1 1.1.1     SI_POV_DA~ Proportion of p~         360 Indonesia  
## # ... with 13 more variables: TimePeriod <dbl>, Value <dbl>, Time_Detail <dbl>,
## #   TimeCoverage <lgl>, UpperBound <lgl>, LowerBound <lgl>, BasePeriod <lgl>,
## #   Source <chr>, GeoInfoUrl <lgl>, FootNote <chr>, Nature <chr>,
## #   Reporting_Type <chr>, Units <chr>

mutate

pov2 = pov %>% arrange(GeoAreaName,TimePeriod) %>% 
  select(contains("GeoArea"),Year=TimePeriod,Value,Units) %>% 
  group_by(GeoAreaName) %>% 
  mutate(Change=(Value-lag(Value))/Value*100)


pov2 %>% filter(GeoAreaName=="Asia")
## # A tibble: 13 x 6
## # Groups:   GeoAreaName [1]
##    GeoAreaCode GeoAreaName  Year Value Units    Change
##          <dbl> <chr>       <dbl> <dbl> <chr>     <dbl>
##  1         142 Asia         1990    49 PERCENT   NA   
##  2         142 Asia         1993    44 PERCENT  -11.4 
##  3         142 Asia         1996    36 PERCENT  -22.2 
##  4         142 Asia         1999    35 PERCENT   -2.86
##  5         142 Asia         2002    30 PERCENT  -16.7 
##  6         142 Asia         2005    23 PERCENT  -30.4 
##  7         142 Asia         2008    19 PERCENT  -21.1 
##  8         142 Asia         2010    15 PERCENT  -26.7 
##  9         142 Asia         2011    12 PERCENT  -25   
## 10         142 Asia         2012    11 PERCENT   -9.09
## 11         142 Asia         2013     9 PERCENT  -22.2 
## 12         142 Asia         2015     6 PERCENT  -50   
## 13         142 Asia         2018     2 PERCENT -200

Problem: When the period between observation is different the figures are not directly comparable! One might want to get average change per year: Can you do that?

join

We can put together two different data objects. That is what we have done previously with left_join. In that case, we got the information for the different countries from wb_countries().

There are different standardized ways to identify countries: ISO-3c, a 3-letter code, ISO-2c, a two-letter code, or ISO-number. All of them are available in the package countrycode. We could use the object codelist, but there is a simpler alternative with the function countrycode. In order to get the ISO-3c from the numeric code we can do:

countrycode(c(4,20),origin="iso3n",dest="iso3c")
## [1] "AFG" "AND"

Countries with codes 4 and 20 are Afghanistan and Andorra. To do it for all the data, we need a mutate

pov2= pov2 %>% 
  mutate(iso_a3=countrycode(GeoAreaCode,"iso3n","iso3c"))

Average and extreme levels

We look now at ways to summarize the data. Summarize provides one observation for each group defined by group_by. In general, after group_by, anything operates at the group level. If you want to switch back to all the data you need to ungroup.

## Average (weighted) levels by continent

ind %>% 
  group_by(region) %>%  # Analyze each region separately
  drop_na(pop) %>%      # Drop countries with no pop data (Eritrea)
  summarize(Average=weighted.mean(life_expectancy,pop,na.rm=TRUE),
            Countries=n(),
            Missing=sum(is.na(life_expectancy)),
            `Pop (millions)`=sum(pop)/1e6
            ) %>%      # Calculate summary statistics by group
  arrange(Average) %>% # Order regions according to increasing average level
  kable(digits=2)      # Put in a nice table, keeping two digits
region Average Countries Missing Pop (millions)
Sub-Saharan Africa 61.26 47 0 1074.85
South Asia 69.41 8 0 1814.39
Middle East & North Africa 74.09 21 0 448.91
Latin America & Caribbean 75.44 42 7 640.47
East Asia & Pacific 76.02 37 6 2304.56
Europe & Central Asia 77.91 58 6 917.92
North America 78.89 3 0 363.81

Comment on average levels by region

ind %>% 
  arrange(-life_expectancy) %>%   # Order in descending order of variable
  select(country,region,`Life expectancy`=life_expectancy) %>% # Select some variables 
  slice(1:5)  %>%        # Get the top five
  kable(digits=1)        # Nice table with 1 decimal digit.
country region Life expectancy
Hong Kong SAR, China East Asia & Pacific 84.9
Japan East Asia & Pacific 84.2
Macao SAR, China East Asia & Pacific 84.1
Switzerland Europe & Central Asia 83.8
Spain Europe & Central Asia 83.4

Comment on top countries

ind %>% arrange(life_expectancy) %>% 
  select(country,region,`Life expectancy`=life_expectancy) %>% slice(1:5) %>% 
  kable(digits=1)
country region Life expectancy
Central African Republic Sub-Saharan Africa 52.8
Lesotho Sub-Saharan Africa 53.7
Chad Sub-Saharan Africa 54.0
Sierra Leone Sub-Saharan Africa 54.3
Nigeria Sub-Saharan Africa 54.3

Comment on bottom countries

Graphs according to per-capita income

ind %>%
  ggplot(aes(
      x = gdp_capita, 
      y = life_expectancy,
      )) +
  geom_point(aes(size=pop, color=region)) +
  geom_smooth(se = FALSE) +
  scale_x_log10(
    labels = scales::dollar_format(),
    breaks = scales::log_breaks(n = 4)
  ) +
  scale_size_continuous(
    labels = scales::number_format(scale = 1/1e6, suffix = "m"),
    breaks = seq(1e8,1e9, 2e8),
    range = c(1,20)
    ) +
  theme_minimal() +
  labs(
    title = "Life expectancy according to GDP per capita",
    x = "GDP per Capita (log scale)",
    y = "years",
    size = "Population",
    color = NULL,
    caption = "Source: World Bank"
  ) 

Global map

We use another package that is already available in the computers, tmap. We can do the map of poverty for the last year of data like this

data(World)

World <- World %>%
  filter(iso_a3 != "ATA") %>% # remove Antarctica
  left_join(pov2 %>% group_by(GeoAreaName) %>% arrange(-Year) %>% slice(1),
             by="iso_a3",suffix=c("","")) # Join the data

Now we do the new map

World %>% ggplot(aes(fill = Value)) +
  geom_sf() +
  scale_fill_viridis_c() +
  theme(legend.position="bottom") +
  labs(
    title = "Map of poverty (below 1$) in the last year",
    fill = NULL,
    caption = paste("Source: Official SDG indicators retrieved with `goalie`") 
  )

References

Ismay, C. and A. Kim (2021) “Statistical Inference via Data Science: A ModernDive into R and the Tidyverse”, https://moderndive.netlify.app/

Engel, C. (2021) “Data Wrangling with R”, https://cengel.github.io/R-data-wrangling/

Wickham, Hadley and Grolemund, Garrett. (2021) R for Data Science: Import, Tidy, Transform, Visualize, and Model Data. O’Reilly Media, 2017. Online at http://r4ds.had.co.nz/