1. Purpose.

The purpose of this noteboook is to illustrate how the tidyr package can be used to tidy data into a structure that works best in R.

2. Load libraries.

library(dplyr)
library(tidyr)

3. Backround.

Tidy data is where:

It can help to think of it in terms of being easy to use in ggplot2. Could each column be a x, y, fill, colour, or facet variable?

Messy data is data where the 3 premises above do not hold true.

3. View messy data examples.

table1 #tidy data: each column is a variable & each row is an observation
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
table2 #messy data: each observation spread across 2 rows
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
table3 #messy data: the rate column contains 2 variables
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
table4a #messy data: the variable year is spread over 2 columns
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
table4b #messy data: the variable year is spread over 2 columns
## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583

4. gather columns into a longer format.

gather(table4a, key="year", value="cases", `1999`:`2000`)
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766
gather(table4b, key="year", value="population", `1999`:`2000`)
## # A tibble: 6 x 3
##   country     year  population
##   <chr>       <chr>      <int>
## 1 Afghanistan 1999    19987071
## 2 Brazil      1999   172006362
## 3 China       1999  1272915272
## 4 Afghanistan 2000    20595360
## 5 Brazil      2000   174504898
## 6 China       2000  1280428583
full_join(
  gather(table4a, key="year", value="cases", `1999`:`2000`),
  gather(table4b, key="year", value="population", `1999`:`2000`)
)
## # A tibble: 6 x 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <int>
## 1 Afghanistan 1999     745   19987071
## 2 Brazil      1999   37737  172006362
## 3 China       1999  212258 1272915272
## 4 Afghanistan 2000    2666   20595360
## 5 Brazil      2000   80488  174504898
## 6 China       2000  213766 1280428583
spread(table2, key=type, value=count)
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
separate(table3, col=rate, into=c("cases", "population"))
## # A tibble: 6 x 4
##   country      year cases  population
## * <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
unite(table3, col = country_year, country, year, sep = "_")
## # A tibble: 6 x 2
##   country_year     rate             
##   <chr>            <chr>            
## 1 Afghanistan_1999 745/19987071     
## 2 Afghanistan_2000 2666/20595360    
## 3 Brazil_1999      37737/172006362  
## 4 Brazil_2000      80488/174504898  
## 5 China_1999       212258/1272915272
## 6 China_2000       213766/1280428583

5. spread columns into a wider format.

spread(table2, key=type, value=count)
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

6. seperate 1 column into many.

separate(table3, col=rate, into=c("cases", "population"))
## # A tibble: 6 x 4
##   country      year cases  population
## * <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

7. unite many columns into 1.

unite(table3, col = country_year, country, year, sep = "_")
## # A tibble: 6 x 2
##   country_year     rate             
##   <chr>            <chr>            
## 1 Afghanistan_1999 745/19987071     
## 2 Afghanistan_2000 2666/20595360    
## 3 Brazil_1999      37737/172006362  
## 4 Brazil_2000      80488/174504898  
## 5 China_1999       212258/1272915272
## 6 China_2000       213766/1280428583