Table of contents:

  1. rename
  2. filter
  3. select
  4. mutate (+ ifelse)
  5. arrange
  6. distinct

These are the functions from dplyr that are used in this notes. Nevertheless, more functions can be found in the following Notes on Data Transformation with dplyr.

rm(list = ls())
setwd("C:/00 Pablo/Programacion/R/Apunte 101")

Loading packages. For this exercise we will import data from the World Bank Indicators package.

library(WDI) # for importing world development indicators from the World Bank
## Warning: package 'WDI' was built under R version 4.4.3
library(dplyr) # for manipulating data
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#library("writexl") # for exporting tables

Getting to know the package WDI.

# WDIsearch function
indicators <- WDIsearch(string = "", cache = NULL) # stores the list of indicators available
head(indicators)
##              indicator                                    name
## 1   1.0.HCount.1.90usd         Poverty Headcount ($1.90 a day)
## 2    1.0.HCount.2.5usd         Poverty Headcount ($2.50 a day)
## 3 1.0.HCount.Mid10to50   Middle Class ($10-50 a day) Headcount
## 4      1.0.HCount.Ofcl Official Moderate Poverty Rate-National
## 5  1.0.HCount.Poor4uds            Poverty Headcount ($4 a day)
## 6  1.0.HCount.Vul4to10      Vulnerable ($4-10 a day) Headcount
# We can also use the WDI search function to look for indicators that we are interested in, like "GDP" or "poverty"

#View(WDIsearch("GDP")) # with View() we can see the whole data frame with all the variables that have "GDP" in their name or description
head(WDIsearch("GDP")) # we use head to see only the first six results
##               indicator                                                 name
## 712      5.51.01.10.gdp                                Per capita GDP growth
## 714     6.0.GDP_current                                      GDP (current $)
## 715      6.0.GDP_growth                                GDP growth (annual %)
## 716         6.0.GDP_usd                                GDP (constant 2005 $)
## 717  6.0.GDPpc_constant GDP per capita, PPP (constant 2011 international $) 
## 1557  BG.GSR.NFSV.GD.ZS                         Trade in services (% of GDP)
head(WDIsearch("poverty")) # same but now with "poverty"
##             indicator                                    name
## 1  1.0.HCount.1.90usd         Poverty Headcount ($1.90 a day)
## 2   1.0.HCount.2.5usd         Poverty Headcount ($2.50 a day)
## 4     1.0.HCount.Ofcl Official Moderate Poverty Rate-National
## 5 1.0.HCount.Poor4uds            Poverty Headcount ($4 a day)
## 7    1.0.PGap.1.90usd               Poverty Gap ($1.90 a day)
## 8     1.0.PGap.2.5usd               Poverty Gap ($2.50 a day)

We import a few variables for a group of countries.

latam_countries <- c("ARG", "BRA", "CHL", "COL", "MEX", "PER", "URY")

list_indicators <- c("NY.GDP.PCAP.CD",  # GDP p/c (current US$)
                     "SI.POV.DDAY",     # Poverty ratio at $1.90/day (2011 PPP) (% of population)
                     "SP.URB.TOTL.IN.ZS", # Urban population (% of total population)
                     "SL.UEM.TOTL.NE.ZS", #Unemployment, total (% of total labor force) (national estimate)
                     "SE.SEC.CUAT.UP.ZS") #Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)

datawdi <- WDI(country = latam_countries, indicator = list_indicators, start = 2022, end = 2023)

1. Rename

We rename some variables in order to make them more legible.

datawdi <- datawdi %>% rename(gdp_pc = "NY.GDP.PCAP.CD",
                              rate_poverty = "SI.POV.DDAY",
                              rate_urban = "SP.URB.TOTL.IN.ZS",
                              rate_unemployment = "SL.UEM.TOTL.NE.ZS",
                              rate_educ_sec = "SE.SEC.CUAT.UP.ZS")

2. Filter

We filter the data frame creating two different data sets that follow different conditions.

datawdi_23 <- datawdi %>% filter(year == 2023)

# we can also add more than one condition (for and we use &, and for or we use |)
datawdi_22_poverty <- datawdi %>% filter(year == 2022 & !is.na(rate_poverty))

3. Select

Diferent ways to use the select function.

Select 1. Using variable names and also a vector of variables (they can be used alone too).

vector_variables <- c("rate_poverty", "rate_urban")
datawdi_02 <- datawdi %>% select(country, iso3c, year, all_of(vector_variables))
head(datawdi_02)
##     country iso3c year rate_poverty rate_urban
## 1 Argentina   ARG 2022          1.3     92.347
## 2 Argentina   ARG 2023          1.2     92.463
## 3    Brazil   BRA 2022          4.9     87.555
## 4    Brazil   BRA 2023          3.8     87.788
## 5     Chile   CHL 2022          0.5     87.912
## 6     Chile   CHL 2023           NA     88.012

Select 2. Using starts_with function.

datawdi_01 <- datawdi %>% select(iso3c, year, starts_with("rate_"))
head(datawdi_01)
##   iso3c year rate_poverty rate_urban rate_unemployment rate_educ_sec
## 1   ARG 2022          1.3     92.347             6.805            NA
## 2   ARG 2023          1.2     92.463             6.139            NA
## 3   BRA 2022          4.9     87.555             9.231      54.01000
## 4   BRA 2023          3.8     87.788             7.947      60.10563
## 5   CHL 2022          0.5     87.912             8.249      66.63000
## 6   CHL 2023           NA     88.012             9.013      68.42550

Select 3. Deleting one or more variables.

datawdi_03 <- datawdi %>% select(-country, -iso2c)
head(datawdi_03)
##   iso3c year    gdp_pc rate_poverty rate_urban rate_unemployment rate_educ_sec
## 1   ARG 2022 13935.681          1.3     92.347             6.805            NA
## 2   ARG 2023 14187.483          1.2     92.463             6.139            NA
## 3   BRA 2022  9281.333          4.9     87.555             9.231      54.01000
## 4   BRA 2023 10377.589          3.8     87.788             7.947      60.10563
## 5   CHL 2022 15405.617          0.5     87.912             8.249      66.63000
## 6   CHL 2023 17067.036           NA     88.012             9.013      68.42550

4. Mutate

Mutate (basic).

Let’s say that we want to convert the percentage of poverty into decimal numbers.

datawdi <- datawdi %>% mutate(poverty_01 = rate_poverty/100)

summary(datawdi$rate_poverty)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.200   1.025   3.050   3.508   5.150   9.300       2
summary(datawdi$poverty_01)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00200 0.01025 0.03050 0.03508 0.05150 0.09300       2

Mutate (with if else).

Let’s say that we want to identify those countries with more than 90% of their population living in urban areas, and that have an unemployment rate lower than 5%.

datawdi <- datawdi %>% mutate(example_variable = ifelse(rate_urban > 85 & rate_unemployment < 8,
                                                        3, ifelse(rate_urban  > 85 & rate_unemployment >= 8, 2, 1)))

table(datawdi$example_variable)
## 
## 1 2 3 
## 6 4 4
datawdi <- datawdi %>%
  mutate(example_variable = factor(example_variable,
                                   levels = c(3, 2, 1),  # real values
                                   labels = c("highly urbanized - low unemployment",
                                              "highly urbanized - high unemployment",
                                              "less urbanized"))) # labels

# One way to show the results with the table function, and then creating a matrix with that content (the matrix function can be skipped, is just about the desired format)
table(datawdi$example_variable) %>% as.matrix()
##                                      [,1]
## highly urbanized - low unemployment     4
## highly urbanized - high unemployment    4
## less urbanized                          6
# Another way is to use the count function, which creates a data frame
datawdi %>% count(example_variable)
##                       example_variable n
## 1  highly urbanized - low unemployment 4
## 2 highly urbanized - high unemployment 4
## 3                       less urbanized 6

5. Arrange

In case we want to organize the dataset according to one or two variables. By default arrange works in ascending order, but it can be also defined in descending order using desc().

datawdi <- datawdi %>% arrange(year)
head(datawdi)
##     country iso2c iso3c year    gdp_pc rate_poverty rate_urban
## 1 Argentina    AR   ARG 2022 13935.681          1.3     92.347
## 2    Brazil    BR   BRA 2022  9281.333          4.9     87.555
## 3     Chile    CL   CHL 2022 15405.617          0.5     87.912
## 4  Colombia    CO   COL 2022  6680.445          9.3     82.050
## 5    Mexico    MX   MEX 2022 11402.141          2.3     81.300
## 6      Peru    PE   PER 2022  7350.630          4.8     78.708
##   rate_unemployment rate_educ_sec poverty_01
## 1             6.805            NA      0.013
## 2             9.231         54.01      0.049
## 3             8.249         66.63      0.005
## 4            10.547         56.08      0.093
## 5             3.256         37.90      0.023
## 6             4.291         56.45      0.048
##                       example_variable
## 1  highly urbanized - low unemployment
## 2 highly urbanized - high unemployment
## 3 highly urbanized - high unemployment
## 4                       less urbanized
## 5                       less urbanized
## 6                       less urbanized
datawdi <- datawdi %>% arrange(desc(year), country)
head(datawdi)
##     country iso2c iso3c year    gdp_pc rate_poverty rate_urban
## 1 Argentina    AR   ARG 2023 14187.483          1.2     92.463
## 2    Brazil    BR   BRA 2023 10377.589          3.8     87.788
## 3     Chile    CL   CHL 2023 17067.036           NA     88.012
## 4  Colombia    CO   COL 2023  7000.837          7.7     82.354
## 5    Mexico    MX   MEX 2023 13826.131           NA     81.582
## 6      Peru    PE   PER 2023  7887.542          5.9     78.922
##   rate_unemployment rate_educ_sec poverty_01
## 1             6.139            NA      0.012
## 2             7.947      60.10563      0.038
## 3             9.013      68.42550         NA
## 4             9.603      57.02414      0.077
## 5             2.765      40.86215         NA
## 6             4.899      64.48654      0.059
##                       example_variable
## 1  highly urbanized - low unemployment
## 2  highly urbanized - low unemployment
## 3 highly urbanized - high unemployment
## 4                       less urbanized
## 5                       less urbanized
## 6                       less urbanized

6. Distinct

In case we want to delete duplicates, then we use the distinct function.

datawdi <- datawdi %>% arrange(year) # with this line we make sure that the data set will be in ascending order the first line will have the lowest value in year, i.e. 2022.
distinct(datawdi, country, .keep_all = TRUE) # keeps the first row that finds (from the top) and deletes the rest
##     country iso2c iso3c year    gdp_pc rate_poverty rate_urban
## 1 Argentina    AR   ARG 2022 13935.681          1.3     92.347
## 2    Brazil    BR   BRA 2022  9281.333          4.9     87.555
## 3     Chile    CL   CHL 2022 15405.617          0.5     87.912
## 4  Colombia    CO   COL 2022  6680.445          9.3     82.050
## 5    Mexico    MX   MEX 2022 11402.141          2.3     81.300
## 6      Peru    PE   PER 2022  7350.630          4.8     78.708
## 7   Uruguay    UY   URY 2022 20818.615          0.2     95.688
##   rate_unemployment rate_educ_sec poverty_01
## 1             6.805            NA      0.013
## 2             9.231         54.01      0.049
## 3             8.249         66.63      0.005
## 4            10.547         56.08      0.093
## 5             3.256         37.90      0.023
## 6             4.291         56.45      0.048
## 7             7.877         34.79      0.002
##                       example_variable
## 1  highly urbanized - low unemployment
## 2 highly urbanized - high unemployment
## 3 highly urbanized - high unemployment
## 4                       less urbanized
## 5                       less urbanized
## 6                       less urbanized
## 7  highly urbanized - low unemployment
datawdi <- datawdi %>% arrange(desc(year)) # now the year 2023 is going to be at the top rows
distinct(datawdi, country, .keep_all = TRUE) # so the year 2023 is the one that remains
##     country iso2c iso3c year    gdp_pc rate_poverty rate_urban
## 1 Argentina    AR   ARG 2023 14187.483          1.2     92.463
## 2    Brazil    BR   BRA 2023 10377.589          3.8     87.788
## 3     Chile    CL   CHL 2023 17067.036           NA     88.012
## 4  Colombia    CO   COL 2023  7000.837          7.7     82.354
## 5    Mexico    MX   MEX 2023 13826.131           NA     81.582
## 6      Peru    PE   PER 2023  7887.542          5.9     78.922
## 7   Uruguay    UY   URY 2023 23019.422          0.2     95.771
##   rate_unemployment rate_educ_sec poverty_01
## 1             6.139            NA      0.012
## 2             7.947      60.10563      0.038
## 3             9.013      68.42550         NA
## 4             9.603      57.02414      0.077
## 5             2.765      40.86215         NA
## 6             4.899      64.48654      0.059
## 7             8.355      33.71787      0.002
##                       example_variable
## 1  highly urbanized - low unemployment
## 2  highly urbanized - low unemployment
## 3 highly urbanized - high unemployment
## 4                       less urbanized
## 5                       less urbanized
## 6                       less urbanized
## 7 highly urbanized - high unemployment
distinct(datawdi, country) # this time we keep only the variable (or variables) that we are using for the combinations of duplicates that we are deleting
##     country
## 1 Argentina
## 2    Brazil
## 3     Chile
## 4  Colombia
## 5    Mexico
## 6      Peru
## 7   Uruguay