library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(babynames)

# Toy data
cases <- tribble(
  ~Country, ~"2011", ~"2012", ~"2013",
      "FR",    7000,    6900,    7000,
      "DE",    5800,    6000,    6200,
      "US",   15000,   14000,   13000
)

pollution <- tribble(
       ~city, ~size, ~amount,
  "New York", "large",      23,
  "New York", "small",      14,
    "London", "large",      22,
    "London", "small",      16,
   "Beijing", "large",     121,
   "Beijing", "small",     121
)

names(who) <- stringr::str_replace(names(who), 
                                   "newrel", 
                                   "new_rel")
table1
## # A tibble: 6 × 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
table2
## # A tibble: 12 × 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
table3
## # A tibble: 6 × 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
table4a
## # A tibble: 3 × 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
table4b
## # A tibble: 3 × 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583
table5
## # A tibble: 6 × 4
##   country     century year  rate             
## * <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583

tidyr

Your Turn 1

On a sheet of paper, draw how the cases data set would look if it had the same values grouped into three columns: country, year, n

Your Turn 2

Use gather() to reorganize table4a into three columns: country, year, and cases.

table4a %>%
  gather(key = "year", value = "n", 2:3)
## # A tibble: 6 × 3
##   country     year       n
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Brazil      1999   37737
## 3 China       1999  212258
## 4 Afghanistan 2000    2666
## 5 Brazil      2000   80488
## 6 China       2000  213766

Your Turn 3

On a sheet of paper, draw how this data set would look if it had the same values grouped into three columns: city, large, small

Your Turn 4

Use spread() to reorganize table2 into four columns: country, year, cases, and population.

table2 %>%
  spread(key = type, value = count)
## # A tibble: 6 × 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

who

Your Turn 5

Gather the 5th through 60th columns of who into a pair of key:value columns named codes and n.

Then select just the county, year, codes and n variables.

who %>%
  gather(key = "codes", value = "n", 5:60) %>%
  select(-iso2, -iso3)
## # A tibble: 405,440 × 4
##    country      year codes           n
##    <chr>       <int> <chr>       <int>
##  1 Afghanistan  1980 new_sp_m014    NA
##  2 Afghanistan  1981 new_sp_m014    NA
##  3 Afghanistan  1982 new_sp_m014    NA
##  4 Afghanistan  1983 new_sp_m014    NA
##  5 Afghanistan  1984 new_sp_m014    NA
##  6 Afghanistan  1985 new_sp_m014    NA
##  7 Afghanistan  1986 new_sp_m014    NA
##  8 Afghanistan  1987 new_sp_m014    NA
##  9 Afghanistan  1988 new_sp_m014    NA
## 10 Afghanistan  1989 new_sp_m014    NA
## # … with 405,430 more rows

Your Turn 6

Separate the sexage column into sex and age columns.

(Hint: Be sure to remove each _ before running the code)

who %>%
  gather("codes", "n", 5:60) %>%
  select(-iso2, -iso3) %>%
  separate(codes, c("new", "type", "sexage"), sep = "_") %>%
  select(-new) %>%
  separate(sexage, into = c("sex", "age"), sep = 1)
## # A tibble: 405,440 × 6
##    country      year type  sex   age       n
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1980 sp    m     014      NA
##  2 Afghanistan  1981 sp    m     014      NA
##  3 Afghanistan  1982 sp    m     014      NA
##  4 Afghanistan  1983 sp    m     014      NA
##  5 Afghanistan  1984 sp    m     014      NA
##  6 Afghanistan  1985 sp    m     014      NA
##  7 Afghanistan  1986 sp    m     014      NA
##  8 Afghanistan  1987 sp    m     014      NA
##  9 Afghanistan  1988 sp    m     014      NA
## 10 Afghanistan  1989 sp    m     014      NA
## # … with 405,430 more rows

Reshaping Final Exam

Your Turn 7

Extend this code to reshape the data into a data set with three columns:

  1. year
  2. M
  3. F

Calculate the percent of male (or female) children by year. Then plot the percent over time.

babynames %>%
  group_by(year, sex) %>% 
  summarise(n = sum(n))
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
## # A tibble: 276 × 3
## # Groups:   year [138]
##     year sex        n
##    <dbl> <chr>  <int>
##  1  1880 F      90993
##  2  1880 M     110491
##  3  1881 F      91953
##  4  1881 M     100743
##  5  1882 F     107847
##  6  1882 M     113686
##  7  1883 F     112319
##  8  1883 M     104627
##  9  1884 F     129020
## 10  1884 M     114442
## # … with 266 more rows

Take Aways

Data comes in many formats but R prefers just one: tidy data.

A data set is tidy if and only if:

  1. Every variable is in its own column
  2. Every observation is in its own row
  3. Every value is in its own cell (which follows from the above)

What is a variable and an observation may depend on your immediate goal.