data

ds<- gapminder::gapminder

# colnames(ds)
# [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"

select

select the first 3 columns

# you have many options
ds %>% select(1:3) %>% head
## # A tibble: 6 × 3
##   country     continent  year
##   <fct>       <fct>     <int>
## 1 Afghanistan Asia       1952
## 2 Afghanistan Asia       1957
## 3 Afghanistan Asia       1962
## 4 Afghanistan Asia       1967
## 5 Afghanistan Asia       1972
## 6 Afghanistan Asia       1977
# ds %>% select(1,2,3)
# ds %>% select(country:year)
# ds %>% select(-c(4:6))
# ds %>% select(-c(lifeExp:gdpPercap))
# ds %>% select(contains("c"))

filter

filter data for Jordan and the year 2007

ds %>% filter(country=="Jordan") %>% filter(year=="2007")
## # A tibble: 1 × 6
##   country continent  year lifeExp     pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Jordan  Asia       2007    72.5 6053193     4519.
ds %>% filter(country=="Jordan" & year=="2007")
## # A tibble: 1 × 6
##   country continent  year lifeExp     pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Jordan  Asia       2007    72.5 6053193     4519.
ds %>% filter(country=="Jordan" , year=="2007")
## # A tibble: 1 × 6
##   country continent  year lifeExp     pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>   <int>     <dbl>
## 1 Jordan  Asia       2007    72.5 6053193     4519.

summarise

What was the total population of the world in 2007

ds %>% filter(year=="2007") %>% summarise(Population=sum(pop))
## # A tibble: 1 × 1
##   Population
##        <dbl>
## 1 6251013179

group_by

What was the the population of each continent in 2007?

ds %>% filter(year=="2007") %>% group_by(continent) %>% summarise(Population_per_continent=sum(pop))
## # A tibble: 5 × 2
##   continent Population_per_continent
##   <fct>                        <dbl>
## 1 Africa                   929539692
## 2 Americas                 898871184
## 3 Asia                    3811953827
## 4 Europe                   586098529
## 5 Oceania                   24549947

mutate

GDP of each contry is population X gdpPercap. Calculate sum GDP for the whole world in 2007

ds %>% filter(year=="2007") %>% mutate(GDP=pop*gdpPercap) %>% summarise(Total_GDP=sum(GDP))
## # A tibble: 1 × 1
##   Total_GDP
##       <dbl>
## 1   5.81e13

GDP in JOD

ds %>% filter(year=="2007") %>% mutate(GDP=pop*gdpPercap, GDP_JOD=GDP/0.71) %>% summarise(Total_GDP=sum(GDP))
## # A tibble: 1 × 1
##   Total_GDP
##       <dbl>
## 1   5.81e13

group_by followed by mutate

What is the percentage of each country population per its continent’s total in 2007?

ds %>% filter(year=="2007") %>% group_by(continent) %>% mutate(cont_pop=sum(pop)) %>% ungroup() %>% mutate(country_Percent=pop*100/cont_pop) %>% head
## # A tibble: 6 × 8
##   country     continent  year lifeExp      pop gdpPercap   cont_pop country_Pe…¹
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>      <dbl>        <dbl>
## 1 Afghanistan Asia       2007    43.8 31889923      975. 3811953827        0.837
## 2 Albania     Europe     2007    76.4  3600523     5937.  586098529        0.614
## 3 Algeria     Africa     2007    72.3 33333216     6223.  929539692        3.59 
## 4 Angola      Africa     2007    42.7 12420476     4797.  929539692        1.34 
## 5 Argentina   Americas   2007    75.3 40301927    12779.  898871184        4.48 
## 6 Australia   Oceania    2007    81.2 20434176    34435.   24549947       83.2  
## # … with abbreviated variable name ¹​country_Percent

arrange

Arrange countries based on gpdOercap in 2007, maximum values on top.

ds %>% filter(year=="2007") %>% arrange(-gdpPercap) %>% head
## # A tibble: 6 × 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Norway           Europe     2007    80.2   4627926    49357.
## 2 Kuwait           Asia       2007    77.6   2505559    47307.
## 3 Singapore        Asia       2007    80.0   4553009    47143.
## 4 United States    Americas   2007    78.2 301139947    42952.
## 5 Ireland          Europe     2007    78.9   4109086    40676.
## 6 Hong Kong, China Asia       2007    82.2   6980412    39725.

distinct

Show the year that had the highest gpd for each country

ds %>% arrange(-gdpPercap) %>% distinct(country)
## # A tibble: 142 × 1
##    country         
##    <fct>           
##  1 Kuwait          
##  2 Norway          
##  3 Singapore       
##  4 United States   
##  5 Ireland         
##  6 Hong Kong, China
##  7 Switzerland     
##  8 Netherlands     
##  9 Canada          
## 10 Iceland         
## # … with 132 more rows
ds %>% arrange(-gdpPercap) %>% distinct(country, .keep_all = T) %>% head
## # A tibble: 6 × 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Kuwait           Asia       1957    58.0    212846   113523.
## 2 Norway           Europe     2007    80.2   4627926    49357.
## 3 Singapore        Asia       2007    80.0   4553009    47143.
## 4 United States    Americas   2007    78.2 301139947    42952.
## 5 Ireland          Europe     2007    78.9   4109086    40676.
## 6 Hong Kong, China Asia       2007    82.2   6980412    39725.

spread

make a table that has columns as names of continents, rows as years, and cells as total population per continent.

ds %>% select(continent, year, pop) %>% group_by(year, continent) %>% summarise(popPerCont=sum(pop)) %>% ungroup() %>%  spread(continent, popPerCont) 
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 6
##     year    Africa  Americas       Asia    Europe  Oceania
##    <int>     <dbl>     <dbl>      <dbl>     <dbl>    <dbl>
##  1  1952 237640501 345152446 1395357351 418120846 10686006
##  2  1957 264837738 386953916 1562780599 437890351 11941976
##  3  1962 296516865 433270254 1696357182 460355155 13283518
##  4  1967 335289489 480746623 1905662900 481178958 14600414
##  5  1972 379879541 529384210 2150972248 500635059 16106100
##  6  1977 433061021 578067699 2384513556 517164531 17239000
##  7  1982 499348587 630290920 2610135582 531266901 18394850
##  8  1987 574834110 682753971 2871220762 543094160 19574415
##  9  1992 659081517 739274104 3133292191 558142797 20919651
## 10  1997 743832984 796900410 3383285500 568944148 22241430
## 11  2002 833723916 849772762 3601802203 578223869 23454829
## 12  2007 929539692 898871184 3811953827 586098529 24549947

slice_max

What are the top 5 countries that have the best life expectency in 2007?

ds %>% filter(year=="2007") %>% slice_max(lifeExp, n=5)
## # A tibble: 5 × 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Japan            Asia       2007    82.6 127467972    31656.
## 2 Hong Kong, China Asia       2007    82.2   6980412    39725.
## 3 Iceland          Europe     2007    81.8    301931    36181.
## 4 Switzerland      Europe     2007    81.7   7554661    37506.
## 5 Australia        Oceania    2007    81.2  20434176    34435.
ds %>% filter(year=="2007") %>% slice_max(lifeExp, n=5) %>% pull(country)
## [1] Japan            Hong Kong, China Iceland          Switzerland     
## [5] Australia       
## 142 Levels: Afghanistan Albania Algeria Angola Argentina Australia ... Zimbabwe

Which country had the highest gpdPercap in each continent in 2007?

ds %>% filter(year=="2007") %>% group_by(continent) %>% slice_max(gdpPercap, n=1) %>% select(continent, country)
## # A tibble: 5 × 2
## # Groups:   continent [5]
##   continent country      
##   <fct>     <fct>        
## 1 Africa    Gabon        
## 2 Americas  United States
## 3 Asia      Kuwait       
## 4 Europe    Norway       
## 5 Oceania   Australia

count

how many contries had a population more than / lesss than 10 million in 2007

ds %>% filter(year=="2007") %>% mutate(pop10M=ifelse(pop>10e6, "More than 10 M", "Less than 10M")) %>%  count(pop10M)
## # A tibble: 2 × 2
##   pop10M             n
##   <chr>          <int>
## 1 Less than 10M     66
## 2 More than 10 M    76

recode

Change the name of Jordan to the Hashemite Kingdom of Jordan

ds %>% mutate(country=recode(country, Jordan="Hashemite Kingdom of Jordan")) %>% head
## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

across

ds %>% mutate(across(is.numeric, sqrt ))%>% head
## Warning: Predicate functions must be wrapped in `where()`.
## 
##   # Bad
##   data %>% select(is.numeric)
## 
##   # Good
##   data %>% select(where(is.numeric))
## 
## ℹ Please update your code.
## This message is displayed once per session.
## # A tibble: 6 × 6
##   country     continent  year lifeExp   pop gdpPercap
##   <fct>       <fct>     <dbl>   <dbl> <dbl>     <dbl>
## 1 Afghanistan Asia       44.2    5.37 2903.      27.9
## 2 Afghanistan Asia       44.2    5.51 3040.      28.7
## 3 Afghanistan Asia       44.3    5.66 3204.      29.2
## 4 Afghanistan Asia       44.4    5.83 3397.      28.9
## 5 Afghanistan Asia       44.4    6.01 3617.      27.2
## 6 Afghanistan Asia       44.5    6.20 3858.      28.0

Advanced Examples

function after group_by

https://dplyr.tidyverse.org/reference/group_map.html ## group_map for each continents , is there a significant association between the gpdPercap and lifeExp in 2007. show p value of linear regression models.

lmPval <- function(DATA) {
  broom::tidy(lm(DATA[["lifeExp"]]~DATA[["gdpPercap"]]))$p.value[2]
}

ds %>% filter(year=="2007") %>%  group_by(continent) %>% group_map(~lmPval(.x))  %>%  unlist
## [1] 4.861750e-03 1.866495e-03 9.130069e-06 2.795338e-09          NaN

group_modify

ds %>% 
  filter(year=="2007") %>%  
  group_by(continent) %>% 
  group_modify(~broom::tidy(lm(lifeExp~gdpPercap, data=.x))) %>% 
  filter(term=="gdpPercap")
## # A tibble: 5 × 6
## # Groups:   continent [5]
##   continent term      estimate   std.error statistic   p.value
##   <fct>     <chr>        <dbl>       <dbl>     <dbl>     <dbl>
## 1 Africa    gdpPercap 0.00102    0.000347       2.95   4.86e-3
## 2 Americas  gdpPercap 0.000270   0.0000769      3.51   1.87e-3
## 3 Asia      gdpPercap 0.000388   0.0000732      5.30   9.13e-6
## 4 Europe    gdpPercap 0.000215   0.0000251      8.54   2.80e-9
## 5 Oceania   gdpPercap 0.000111 NaN            NaN    NaN