tidyverse

Saayed Alam
December 12, 2018

Data Wrangling with dplyr

library(tidyverse)

# load the package that provides the gapminder dataset
library(gapminder)

# Print the gapminder tibble; the header of the output below reports
# 1,704 rows and 6 variables.
gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,694 more rows

Five main verbs of data wrangling

filter()

# Keep only the rows that satisfy a condition (here, observations from 1987)
# and preview the first six of them.
gapminder %>%
  filter(year == 1987L) %>%
  head(n = 6)
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1987    40.8 13867957      852.
## 2 Albania     Europe     1987    72    3075321     3739.
## 3 Algeria     Africa     1987    65.8 23254956     5681.
## 4 Angola      Africa     1987    39.9  7874230     2430.
## 5 Argentina   Americas   1987    70.8 31620918     9140.
## 6 Australia   Oceania    1987    76.3 16257249    21889.

summarise()

# Collapse the matching rows (Bangladesh, 1987) into a single summary value
# computed from one column.
gapminder %>%
  filter(country == "Bangladesh", year == 1987L) %>%
  summarise(Max_Life_Expectancy = max(lifeExp))
## # A tibble: 1 x 1
##   Max_Life_Expectancy
##                 <dbl>
## 1                52.8

group_by()

# Compute the same summary separately for every level of a categorical
# column: the maximum 1987 life expectancy within each continent.
gapminder %>%
  filter(year == 1987L) %>%
  group_by(continent) %>%
  summarise(Max_Life_Expectancy = max(lifeExp))
## # A tibble: 5 x 2
##   continent Max_Life_Expectancy
##   <fct>                   <dbl>
## 1 Africa                   71.9
## 2 Americas                 76.9
## 3 Asia                     78.7
## 4 Europe                   77.4
## 5 Oceania                  76.3

mutate()

# Add a column derived from existing ones: total GDP is GDP per capita
# multiplied by population.
gapminder %>%
  mutate(gdp = gdpPercap * pop) %>%
  head(n = 6)
## # A tibble: 6 x 7
##   country     continent  year lifeExp      pop gdpPercap          gdp
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>        <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.  6567086330.
## 2 Afghanistan Asia       1957    30.3  9240934      821.  7585448670.
## 3 Afghanistan Asia       1962    32.0 10267083      853.  8758855797.
## 4 Afghanistan Asia       1967    34.0 11537966      836.  9648014150.
## 5 Afghanistan Asia       1972    36.1 13079460      740.  9678553274.
## 6 Afghanistan Asia       1977    38.4 14880372      786. 11697659231.

arrange()

# Sort the rows by the values of a column: among observations before 2000,
# list the highest life expectancies first.
gapminder %>%
  filter(year < 2000L) %>%
  arrange(desc(lifeExp)) %>%
  head(n = 6)
## # A tibble: 6 x 6
##   country          continent  year lifeExp       pop gdpPercap
##   <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
## 1 Japan            Asia       1997    80.7 125956499    28817.
## 2 Hong Kong, China Asia       1997    80     6495918    28378.
## 3 Sweden           Europe     1997    79.4   8897619    25267.
## 4 Switzerland      Europe     1997    79.4   7193761    32135.
## 5 Japan            Asia       1992    79.4 124329269    26825.
## 6 Iceland          Europe     1997    79.0    271192    28061.