These are the functions from dplyr that are used in this notes. Nevertheless, more functions can be found in the following Notes on Data Transformation with dplyr.
rm(list = ls())
setwd("C:/00 Pablo/Programacion/R/Apunte 101")
Loading packages. For this exercise we will import data from the World Bank Indicators package.
library(WDI) # for importing world development indicators from the World Bank
## Warning: package 'WDI' was built under R version 4.4.3
library(dplyr) # for manipulating data
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#library("writexl") # for exporting tables
Getting to know the package WDI.
# WDIsearch function
indicators <- WDIsearch(string = "", cache = NULL) # stores the list of indicators available
head(indicators)
## indicator name
## 1 1.0.HCount.1.90usd Poverty Headcount ($1.90 a day)
## 2 1.0.HCount.2.5usd Poverty Headcount ($2.50 a day)
## 3 1.0.HCount.Mid10to50 Middle Class ($10-50 a day) Headcount
## 4 1.0.HCount.Ofcl Official Moderate Poverty Rate-National
## 5 1.0.HCount.Poor4uds Poverty Headcount ($4 a day)
## 6 1.0.HCount.Vul4to10 Vulnerable ($4-10 a day) Headcount
# We can also use the WDI search function to look for indicators that we are interested in, like "GDP" or "poverty"
#View(WDIsearch("GDP")) # with View() we can see the whole data frame with all the variables that have "GDP" in their name or description
head(WDIsearch("GDP")) # we use head to see only the first six results
## indicator name
## 712 5.51.01.10.gdp Per capita GDP growth
## 714 6.0.GDP_current GDP (current $)
## 715 6.0.GDP_growth GDP growth (annual %)
## 716 6.0.GDP_usd GDP (constant 2005 $)
## 717 6.0.GDPpc_constant GDP per capita, PPP (constant 2011 international $)
## 1557 BG.GSR.NFSV.GD.ZS Trade in services (% of GDP)
head(WDIsearch("poverty")) # same but now with "poverty"
## indicator name
## 1 1.0.HCount.1.90usd Poverty Headcount ($1.90 a day)
## 2 1.0.HCount.2.5usd Poverty Headcount ($2.50 a day)
## 4 1.0.HCount.Ofcl Official Moderate Poverty Rate-National
## 5 1.0.HCount.Poor4uds Poverty Headcount ($4 a day)
## 7 1.0.PGap.1.90usd Poverty Gap ($1.90 a day)
## 8 1.0.PGap.2.5usd Poverty Gap ($2.50 a day)
We import a few variables for a group of countries.
latam_countries <- c("ARG", "BRA", "CHL", "COL", "MEX", "PER", "URY")
list_indicators <- c("NY.GDP.PCAP.CD", # GDP p/c (current US$)
"SI.POV.DDAY", # Poverty ratio at $1.90/day (2011 PPP) (% of population)
"SP.URB.TOTL.IN.ZS", # Urban population (% of total population)
"SL.UEM.TOTL.NE.ZS", #Unemployment, total (% of total labor force) (national estimate)
"SE.SEC.CUAT.UP.ZS") #Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative)
datawdi <- WDI(country = latam_countries, indicator = list_indicators, start = 2022, end = 2023)
We rename some variables in order to make them more legible.
datawdi <- datawdi %>% rename(gdp_pc = "NY.GDP.PCAP.CD",
rate_poverty = "SI.POV.DDAY",
rate_urban = "SP.URB.TOTL.IN.ZS",
rate_unemployment = "SL.UEM.TOTL.NE.ZS",
rate_educ_sec = "SE.SEC.CUAT.UP.ZS")
We filter the data frame creating two different data sets that follow different conditions.
datawdi_23 <- datawdi %>% filter(year == 2023)
# we can also add more than one condition (for and we use &, and for or we use |)
datawdi_22_poverty <- datawdi %>% filter(year == 2022 & !is.na(rate_poverty))
Diferent ways to use the select function.
Select 1. Using variable names and also a vector of variables (they can be used alone too).
vector_variables <- c("rate_poverty", "rate_urban")
datawdi_02 <- datawdi %>% select(country, iso3c, year, all_of(vector_variables))
head(datawdi_02)
## country iso3c year rate_poverty rate_urban
## 1 Argentina ARG 2022 1.3 92.347
## 2 Argentina ARG 2023 1.2 92.463
## 3 Brazil BRA 2022 4.9 87.555
## 4 Brazil BRA 2023 3.8 87.788
## 5 Chile CHL 2022 0.5 87.912
## 6 Chile CHL 2023 NA 88.012
Select 2. Using starts_with function.
datawdi_01 <- datawdi %>% select(iso3c, year, starts_with("rate_"))
head(datawdi_01)
## iso3c year rate_poverty rate_urban rate_unemployment rate_educ_sec
## 1 ARG 2022 1.3 92.347 6.805 NA
## 2 ARG 2023 1.2 92.463 6.139 NA
## 3 BRA 2022 4.9 87.555 9.231 54.01000
## 4 BRA 2023 3.8 87.788 7.947 60.10563
## 5 CHL 2022 0.5 87.912 8.249 66.63000
## 6 CHL 2023 NA 88.012 9.013 68.42550
Select 3. Deleting one or more variables.
datawdi_03 <- datawdi %>% select(-country, -iso2c)
head(datawdi_03)
## iso3c year gdp_pc rate_poverty rate_urban rate_unemployment rate_educ_sec
## 1 ARG 2022 13935.681 1.3 92.347 6.805 NA
## 2 ARG 2023 14187.483 1.2 92.463 6.139 NA
## 3 BRA 2022 9281.333 4.9 87.555 9.231 54.01000
## 4 BRA 2023 10377.589 3.8 87.788 7.947 60.10563
## 5 CHL 2022 15405.617 0.5 87.912 8.249 66.63000
## 6 CHL 2023 17067.036 NA 88.012 9.013 68.42550
Mutate (basic).
Let’s say that we want to convert the percentage of poverty into decimal numbers.
datawdi <- datawdi %>% mutate(poverty_01 = rate_poverty/100)
summary(datawdi$rate_poverty)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.200 1.025 3.050 3.508 5.150 9.300 2
summary(datawdi$poverty_01)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00200 0.01025 0.03050 0.03508 0.05150 0.09300 2
Mutate (with if else).
Let’s say that we want to identify those countries with more than 90% of their population living in urban areas, and that have an unemployment rate lower than 5%.
datawdi <- datawdi %>% mutate(example_variable = ifelse(rate_urban > 85 & rate_unemployment < 8,
3, ifelse(rate_urban > 85 & rate_unemployment >= 8, 2, 1)))
table(datawdi$example_variable)
##
## 1 2 3
## 6 4 4
datawdi <- datawdi %>%
mutate(example_variable = factor(example_variable,
levels = c(3, 2, 1), # real values
labels = c("highly urbanized - low unemployment",
"highly urbanized - high unemployment",
"less urbanized"))) # labels
# One way to show the results with the table function, and then creating a matrix with that content (the matrix function can be skipped, is just about the desired format)
table(datawdi$example_variable) %>% as.matrix()
## [,1]
## highly urbanized - low unemployment 4
## highly urbanized - high unemployment 4
## less urbanized 6
# Another way is to use the count function, which creates a data frame
datawdi %>% count(example_variable)
## example_variable n
## 1 highly urbanized - low unemployment 4
## 2 highly urbanized - high unemployment 4
## 3 less urbanized 6
In case we want to organize the dataset according to one or two variables. By default arrange works in ascending order, but it can be also defined in descending order using desc().
datawdi <- datawdi %>% arrange(year)
head(datawdi)
## country iso2c iso3c year gdp_pc rate_poverty rate_urban
## 1 Argentina AR ARG 2022 13935.681 1.3 92.347
## 2 Brazil BR BRA 2022 9281.333 4.9 87.555
## 3 Chile CL CHL 2022 15405.617 0.5 87.912
## 4 Colombia CO COL 2022 6680.445 9.3 82.050
## 5 Mexico MX MEX 2022 11402.141 2.3 81.300
## 6 Peru PE PER 2022 7350.630 4.8 78.708
## rate_unemployment rate_educ_sec poverty_01
## 1 6.805 NA 0.013
## 2 9.231 54.01 0.049
## 3 8.249 66.63 0.005
## 4 10.547 56.08 0.093
## 5 3.256 37.90 0.023
## 6 4.291 56.45 0.048
## example_variable
## 1 highly urbanized - low unemployment
## 2 highly urbanized - high unemployment
## 3 highly urbanized - high unemployment
## 4 less urbanized
## 5 less urbanized
## 6 less urbanized
datawdi <- datawdi %>% arrange(desc(year), country)
head(datawdi)
## country iso2c iso3c year gdp_pc rate_poverty rate_urban
## 1 Argentina AR ARG 2023 14187.483 1.2 92.463
## 2 Brazil BR BRA 2023 10377.589 3.8 87.788
## 3 Chile CL CHL 2023 17067.036 NA 88.012
## 4 Colombia CO COL 2023 7000.837 7.7 82.354
## 5 Mexico MX MEX 2023 13826.131 NA 81.582
## 6 Peru PE PER 2023 7887.542 5.9 78.922
## rate_unemployment rate_educ_sec poverty_01
## 1 6.139 NA 0.012
## 2 7.947 60.10563 0.038
## 3 9.013 68.42550 NA
## 4 9.603 57.02414 0.077
## 5 2.765 40.86215 NA
## 6 4.899 64.48654 0.059
## example_variable
## 1 highly urbanized - low unemployment
## 2 highly urbanized - low unemployment
## 3 highly urbanized - high unemployment
## 4 less urbanized
## 5 less urbanized
## 6 less urbanized
In case we want to delete duplicates, then we use the distinct function.
datawdi <- datawdi %>% arrange(year) # with this line we make sure that the data set will be in ascending order the first line will have the lowest value in year, i.e. 2022.
distinct(datawdi, country, .keep_all = TRUE) # keeps the first row that finds (from the top) and deletes the rest
## country iso2c iso3c year gdp_pc rate_poverty rate_urban
## 1 Argentina AR ARG 2022 13935.681 1.3 92.347
## 2 Brazil BR BRA 2022 9281.333 4.9 87.555
## 3 Chile CL CHL 2022 15405.617 0.5 87.912
## 4 Colombia CO COL 2022 6680.445 9.3 82.050
## 5 Mexico MX MEX 2022 11402.141 2.3 81.300
## 6 Peru PE PER 2022 7350.630 4.8 78.708
## 7 Uruguay UY URY 2022 20818.615 0.2 95.688
## rate_unemployment rate_educ_sec poverty_01
## 1 6.805 NA 0.013
## 2 9.231 54.01 0.049
## 3 8.249 66.63 0.005
## 4 10.547 56.08 0.093
## 5 3.256 37.90 0.023
## 6 4.291 56.45 0.048
## 7 7.877 34.79 0.002
## example_variable
## 1 highly urbanized - low unemployment
## 2 highly urbanized - high unemployment
## 3 highly urbanized - high unemployment
## 4 less urbanized
## 5 less urbanized
## 6 less urbanized
## 7 highly urbanized - low unemployment
datawdi <- datawdi %>% arrange(desc(year)) # now the year 2023 is going to be at the top rows
distinct(datawdi, country, .keep_all = TRUE) # so the year 2023 is the one that remains
## country iso2c iso3c year gdp_pc rate_poverty rate_urban
## 1 Argentina AR ARG 2023 14187.483 1.2 92.463
## 2 Brazil BR BRA 2023 10377.589 3.8 87.788
## 3 Chile CL CHL 2023 17067.036 NA 88.012
## 4 Colombia CO COL 2023 7000.837 7.7 82.354
## 5 Mexico MX MEX 2023 13826.131 NA 81.582
## 6 Peru PE PER 2023 7887.542 5.9 78.922
## 7 Uruguay UY URY 2023 23019.422 0.2 95.771
## rate_unemployment rate_educ_sec poverty_01
## 1 6.139 NA 0.012
## 2 7.947 60.10563 0.038
## 3 9.013 68.42550 NA
## 4 9.603 57.02414 0.077
## 5 2.765 40.86215 NA
## 6 4.899 64.48654 0.059
## 7 8.355 33.71787 0.002
## example_variable
## 1 highly urbanized - low unemployment
## 2 highly urbanized - low unemployment
## 3 highly urbanized - high unemployment
## 4 less urbanized
## 5 less urbanized
## 6 less urbanized
## 7 highly urbanized - high unemployment
distinct(datawdi, country) # this time we keep only the variable (or variables) that we are using for the combinations of duplicates that we are deleting
## country
## 1 Argentina
## 2 Brazil
## 3 Chile
## 4 Colombia
## 5 Mexico
## 6 Peru
## 7 Uruguay