title: "Untitled" author: "Suma Pendyala" date: "07/04/2020" output: html_document

library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

table1

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583
table2

## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583

#> # A tibble: 12 x 4
#>   country      year type           count
#>   <chr>       <int> <chr>          <int>
#> 1 Afghanistan  1999 cases            745
#> 2 Afghanistan  1999 population  19987071
#> 3 Afghanistan  2000 cases           2666
#> 4 Afghanistan  2000 population  20595360
#> 5 Brazil       1999 cases          37737
#> 6 Brazil       1999 population 172006362
#> # . with 6 more rows
table3

## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583

#> # A tibble: 6 x 3
#>   country      year rate             
#> * <chr>       <int> <chr>            
#> 1 Afghanistan  1999 745/19987071     
#> 2 Afghanistan  2000 2666/20595360    
#> 3 Brazil       1999 37737/172006362  
#> 4 Brazil       2000 80488/174504898  
#> 5 China        1999 212258/1272915272
#> 6 China        2000 213766/1280428583

# Spread across two tibbles
table4a  # cases

## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766

#> # A tibble: 3 x 3
#>   country     `1999` `2000`
#> * <chr>        <int>  <int>
#> 1 Afghanistan    745   2666
#> 2 Brazil       37737  80488
#> 3 China       212258 213766
table4b  # population

## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583

#> # A tibble: 3 x 3
#>   country         `1999`     `2000`
#> * <chr>            <int>      <int>
#> 1 Afghanistan   19987071   20595360
#> 2 Brazil       172006362  174504898
#> 3 China       1272915272 1280428583

# Compute rate per 10,000
table1 %>%
  mutate(rate = cases / population * 10000)

## # A tibble: 6 x 5
##   country      year  cases population  rate
##   <chr>       <int>  <int>      <int> <dbl>
## 1 Afghanistan  1999    745   19987071 0.373
## 2 Afghanistan  2000   2666   20595360 1.29 
## 3 Brazil       1999  37737  172006362 2.19 
## 4 Brazil       2000  80488  174504898 4.61 
## 5 China        1999 212258 1272915272 1.67 
## 6 China        2000 213766 1280428583 1.67

#> # A tibble: 6 x 5
#>   country      year  cases population  rate
#>   <chr>       <int>  <int>      <int> <dbl>
#> 1 Afghanistan  1999    745   19987071 0.373
#> 2 Afghanistan  2000   2666   20595360 1.29 
#> 3 Brazil       1999  37737  172006362 2.19 
#> 4 Brazil       2000  80488  174504898 4.61 
#> 5 China        1999 212258 1272915272 1.67 
#> 6 China        2000 213766 1280428583 1.67

# Compute cases per year
table1 %>%
  count(year, wt = cases)

## # A tibble: 2 x 2
##    year      n
##   <int>  <int>
## 1  1999 250740
## 2  2000 296920

#> # A tibble: 2 x 2
#>    year      n
#>   <int>  <int>
#> 1  1999 250740
#> 2  2000 296920

# Visualise changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) +
  geom_line(aes(group = country), colour = "grey50") +
  geom_point(aes(colour = country))

1 - Using prose, describe how the variables and observations are organised in each of the sample tables.

In table1, each observation has its own row and each variable has its own column.

#> # A tibble: 6 <U+643C><U+3E37> 4
#>       country  year  cases population
#>                  
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3      Brazil  1999  37737  172006362
#> 4      Brazil  2000  80488  174504898
#> 5       China  1999 212258 1272915272
#> 6       China  2000 213766 1280428583

In table2, each row represents the country, year, and the variable type of either case or population. The variable count represents the unique value for the variable type.

#> # A tibble: 12 <U+643C><U+3E37> 4
#>       country  year       type     count
#>                     
#> 1 Afghanistan  1999      cases       745
#> 2 Afghanistan  1999 population  19987071
#> 3 Afghanistan  2000      cases      2666
#> 4 Afghanistan  2000 population  20595360
#> 5      Brazil  1999      cases     37737
#> 6      Brazil  1999 population 172006362

#> # A tibble: 6 <U+643C><U+3E37> 3
#>       country  year              rate
#> *                     
#> 1 Afghanistan  1999      745/19987071
#> 2 Afghanistan  2000     2666/20595360
#> 3      Brazil  1999   37737/172006362
#> 4      Brazil  2000   80488/174504898
#> 5       China  1999 212258/1272915272
#> 6       China  2000 213766/1280428583

In table4, cases and population are represented in separate tables. Years 1999 and 2000 are treated as different variables.

table4a  # cases

## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766

#> # A tibble: 3 <U+643C><U+3E37> 3
#>       country `1999` `2000`
#> *           
#> 1 Afghanistan    745   2666
#> 2      Brazil  37737  80488
#> 3       China 212258 213766
table4b  # population

## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583

#> # A tibble: 3 <U+643C><U+3E37> 3
#>       country     `1999`     `2000`
#> *                   
#> 1 Afghanistan   19987071   20595360
#> 2      Brazil  172006362  174504898
#> 3       China 1272915272 1280428583

countries <- filter(table2, type == 'cases')$country
years <- filter(table2, type == 'cases')$year
cases <- filter(table2, type == 'cases')$count
populations <- filter(table2, type == 'population')$count

table2_rate <- tibble(country = countries,
                      year = years,
                      rate = cases/populations * 10000)

table2_rate

## # A tibble: 6 x 3
##   country      year  rate
##   <chr>       <int> <dbl>
## 1 Afghanistan  1999 0.373
## 2 Afghanistan  2000 1.29 
## 3 Brazil       1999 2.19 
## 4 Brazil       2000 4.61 
## 5 China        1999 1.67 
## 6 China        2000 1.67

countries <- table4a$country
cases_1999 <- table4a$`1999`
cases_2000 <- table4a$`2000`
populations_1999 <- table4b$`1999`
populations_2000 <- table4b$`2000`

table_1999_rate <- tibble(country = countries,
                          year = 1999,
                          rate = cases_1999 / populations_1999 * 10000)

table_2000_rate <- tibble(country = countries,
                          year = 2000,
                          rate = cases_2000 / populations_2000 * 10000)

table4_rate <- rbind(table_1999_rate, table_2000_rate) %>% arrange(country)

table4_rate

## # A tibble: 6 x 3
##   country      year  rate
##   <chr>       <dbl> <dbl>
## 1 Afghanistan  1999 0.373
## 2 Afghanistan  2000 1.29 
## 3 Brazil       1999 2.19 
## 4 Brazil       2000 4.61 
## 5 China        1999 1.67 
## 6 China        2000 1.67

ggplot(data = filter(table2, type == 'cases'),
       mapping = aes(x = year, y= count)) +
  geom_line(mapping = aes(group = country),
            color = 'grey50') +
  geom_point(mapping = aes(color = country)) +
  labs(y = 'cases') +
  scale_x_continuous(breaks = (c(1999,2000)))

table4a

## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766

#> # A tibble: 3 x 3
#>   country     `1999` `2000`
#> * <chr>        <int>  <int>
#> 1 Afghanistan    745   2666
#> 2 Brazil       37737  80488
#> 3 China       212258 213766

table4a %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")

## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Afghanistan 2000    2666
## 3 Brazil      1999   37737
## 4 Brazil      2000   80488
## 5 China       1999  212258
## 6 China       2000  213766

#> # A tibble: 6 x 3
#>   country     year   cases
#>   <chr>       <chr>  <int>
#> 1 Afghanistan 1999     745
#> 2 Afghanistan 2000    2666
#> 3 Brazil      1999   37737
#> 4 Brazil      2000   80488
#> 5 China       1999  212258
#> 6 China       2000  213766

table4b %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")

## # A tibble: 6 x 3
##   country     year  population
##   <chr>       <chr>      <int>
## 1 Afghanistan 1999    19987071
## 2 Afghanistan 2000    20595360
## 3 Brazil      1999   172006362
## 4 Brazil      2000   174504898
## 5 China       1999  1272915272
## 6 China       2000  1280428583

#> # A tibble: 6 x 3
#>   country     year  population
#>   <chr>       <chr>      <int>
#> 1 Afghanistan 1999    19987071
#> 2 Afghanistan 2000    20595360
#> 3 Brazil      1999   172006362
#> 4 Brazil      2000   174504898
#> 5 China       1999  1272915272
#> 6 China       2000  1280428583

tidy4a <- table4a %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b)

## Joining, by = c("country", "year")

## # A tibble: 6 x 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <int>
## 1 Afghanistan 1999     745   19987071
## 2 Afghanistan 2000    2666   20595360
## 3 Brazil      1999   37737  172006362
## 4 Brazil      2000   80488  174504898
## 5 China       1999  212258 1272915272
## 6 China       2000  213766 1280428583

#> Joining, by = c("country", "year")
#> # A tibble: 6 x 4
#>   country     year   cases population
#>   <chr>       <chr>  <int>      <int>
#> 1 Afghanistan 1999     745   19987071
#> 2 Afghanistan 2000    2666   20595360
#> 3 Brazil      1999   37737  172006362
#> 4 Brazil      2000   80488  174504898
#> 5 China       1999  212258 1272915272
#> 6 China       2000  213766 1280428583

table2

## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583

#> # A tibble: 12 x 4
#>   country      year type           count
#>   <chr>       <int> <chr>          <int>
#> 1 Afghanistan  1999 cases            745
#> 2 Afghanistan  1999 population  19987071
#> 3 Afghanistan  2000 cases           2666
#> 4 Afghanistan  2000 population  20595360
#> 5 Brazil       1999 cases          37737
#> 6 Brazil       1999 population 172006362
#> # . with 6 more rows

table2 %>%
    pivot_wider(names_from = type, values_from = count)

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

1- Why are pivot_longer() and pivot_wider() not perfectly symmetrical? Carefully consider the following example

stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
  pivot_wider(names_from = year, values_from = return) %>%
  pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")

## # A tibble: 4 x 3
##    half year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     1 2016    0.92
## 3     2 2015    0.59
## 4     2 2016    0.17

table4a %>%
  pivot_longer(c(1999, 2000), names_to = "year", values_to = "cases")

## Error: Can't subset columns that don't exist.
## [31mx[39m Locations 1999 and 2000 don't exist.
## [34mi[39m There are only 3 columns.

#> Error: Can't subset columns that don't exist.
#> ??? The locations 1999 and 2000 don't exist.
#> ??? There are only 3 columns.

What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value?

people <- tribble(
  ~name,             ~names,  ~values,
  #-----------------|--------|------
  "Phillip Woods",   "age",       45,
  "Phillip Woods",   "height",   186,
  "Phillip Woods",   "age",       50,
  "Jessica Cordero", "age",       37,
  "Jessica Cordero", "height",   156
)

Tidy the simple tibble below. Do you need to make it wider or longer? What are the variables?

preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes",     NA,    10,
  "no",      20,    12
)

table3

## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583

#> # A tibble: 6 x 3
#>   country      year rate             
#> * <chr>       <int> <chr>            
#> 1 Afghanistan  1999 745/19987071     
#> 2 Afghanistan  2000 2666/20595360    
#> 3 Brazil       1999 37737/172006362  
#> 4 Brazil       2000 80488/174504898  
#> 5 China        1999 212258/1272915272
#> 6 China        2000 213766/1280428583

table3 %>%
  separate(rate, into = c("cases", "population"))

## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

#> # A tibble: 6 x 4
#>   country      year cases  population
#>   <chr>       <int> <chr>  <chr>     
#> 1 Afghanistan  1999 745    19987071  
#> 2 Afghanistan  2000 2666   20595360  
#> 3 Brazil       1999 37737  172006362 
#> 4 Brazil       2000 80488  174504898 
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

table3 %>%
  separate(rate, into = c("cases", "population"), sep = "/")

## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

table3 %>%
  separate(rate, into = c("cases", "population"), convert = TRUE)

## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

table3 %>%
  separate(year, into = c("century", "year"), sep = 2)

## # A tibble: 6 x 4
##   country     century year  rate             
##   <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583

#> # A tibble: 6 x 4
#>   country     century year  rate             
#>   <chr>       <chr>   <chr> <chr>            
#> 1 Afghanistan 19      99    745/19987071     
#> 2 Afghanistan 20      00    2666/20595360    
#> 3 Brazil      19      99    37737/172006362  
#> 4 Brazil      20      00    80488/174504898  
#> 5 China       19      99    212258/1272915272
#> 6 China       20      00    213766/1280428583

table5 %>%
  unite(new, century, year)

## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 19_99 745/19987071     
## 2 Afghanistan 20_00 2666/20595360    
## 3 Brazil      19_99 37737/172006362  
## 4 Brazil      20_00 80488/174504898  
## 5 China       19_99 212258/1272915272
## 6 China       20_00 213766/1280428583

#> # A tibble: 6 x 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 19_99 745/19987071     
#> 2 Afghanistan 20_00 2666/20595360    
#> 3 Brazil      19_99 37737/172006362  
#> 4 Brazil      20_00 80488/174504898  
#> 5 China       19_99 212258/1272915272
#> 6 China       20_00 213766/1280428583

table5 %>%
  unite(new, century, year, sep = "")

## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583

#> # A tibble: 6 x 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 1999  745/19987071     
#> 2 Afghanistan 2000  2666/20595360    
#> 3 Brazil      1999  37737/172006362  
#> 4 Brazil      2000  80488/174504898  
#> 5 China       1999  212258/1272915272
#> 6 China       2000  213766/1280428583

What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.

tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"))

## Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].

## # A tibble: 3 x 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     f    
## 3 h     i     j

tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"))

## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].

## # A tibble: 3 x 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     <NA> 
## 3 f     g     i

Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?

table3 %>%
  separate(year, into = c("century", "year"), sep = 2, remove = FALSE)

## # A tibble: 6 x 4
##   country     rate              century year 
##   <chr>       <chr>             <chr>   <chr>
## 1 Afghanistan 745/19987071      19      99   
## 2 Afghanistan 2666/20595360     20      00   
## 3 Brazil      37737/172006362   19      99   
## 4 Brazil      80488/174504898   20      00   
## 5 China       212258/1272915272 19      99   
## 6 China       213766/1280428583 20      00

table5 %>%
  unite(new, century, year, sep = "", remove = FALSE)

## # A tibble: 6 x 5
##   country     new   century year  rate             
##   <chr>       <chr> <chr>   <chr> <chr>            
## 1 Afghanistan 1999  19      99    745/19987071     
## 2 Afghanistan 2000  20      00    2666/20595360    
## 3 Brazil      1999  19      99    37737/172006362  
## 4 Brazil      2000  20      00    80488/174504898  
## 5 China       1999  19      99    212258/1272915272
## 6 China       2000  20      00    213766/1280428583

3 - Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite?

stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

stocks %>%
  pivot_wider(names_from = year, values_from = return)

## # A tibble: 4 x 3
##     qtr `2015` `2016`
##   <dbl>  <dbl>  <dbl>
## 1     1   1.88  NA   
## 2     2   0.59   0.92
## 3     3   0.35   0.17
## 4     4  NA      2.66

#> # A tibble: 4 x 3
#>     qtr `2015` `2016`
#>   <dbl>  <dbl>  <dbl>
#> 1     1   1.88  NA   
#> 2     2   0.59   0.92
#> 3     3   0.35   0.17
#> 4     4  NA      2.66

stocks %>%
  pivot_wider(names_from = year, values_from = return) %>%
  pivot_longer(
    cols = c(`2015`, `2016`),
    names_to = "year",
    values_to = "return",
    values_drop_na = TRUE
  )

## # A tibble: 6 x 3
##     qtr year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     2 2015    0.59
## 3     2 2016    0.92
## 4     3 2015    0.35
## 5     3 2016    0.17
## 6     4 2016    2.66

#> # A tibble: 6 x 3
#>     qtr year  return
#>   <dbl> <chr>  <dbl>
#> 1     1 2015    1.88
#> 2     2 2015    0.59
#> 3     2 2016    0.92
#> 4     3 2015    0.35
#> 5     3 2016    0.17
#> 6     4 2016    2.66

stocks %>%
  complete(year, qtr)

## # A tibble: 8 x 3
##    year   qtr return
##   <dbl> <dbl>  <dbl>
## 1  2015     1   1.88
## 2  2015     2   0.59
## 3  2015     3   0.35
## 4  2015     4  NA   
## 5  2016     1  NA   
## 6  2016     2   0.92
## 7  2016     3   0.17
## 8  2016     4   2.66

#> # A tibble: 8 x 3
#>    year   qtr return
#>   <dbl> <dbl>  <dbl>
#> 1  2015     1   1.88
#> 2  2015     2   0.59
#> 3  2015     3   0.35
#> 4  2015     4  NA   
#> 5  2016     1  NA   
#> 6  2016     2   0.92
#> # . with 2 more rows

treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

treatment %>%
  fill(person)

## # A tibble: 4 x 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 Derrick Whitmore         2       10
## 3 Derrick Whitmore         3        9
## 4 Katherine Burke          1        4

#> # A tibble: 4 x 3
#>   person           treatment response
#>   <chr>                <dbl>    <dbl>
#> 1 Derrick Whitmore         1        7
#> 2 Derrick Whitmore         2       10
#> 3 Derrick Whitmore         3        9
#> 4 Katherine Burke          1        4

1 - Compare and contrast the fill arguments to spread() and complete(). In spread(), all NAs will be replaced by the fill value. The fill argument only takes in one value. In complete(), NAs under different variables can be replaced by different values. The fill argument takes in a list that specifies the values to replace NA for different variables. 2 - What does the direction argument to fill() do? The default value is down. Any NAs will be replaced by the previous non-missing value. The filling direction can be reversed if .direction is set to up.

who

## # A tibble: 7,240 x 60
##    country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
##    <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>
##  1 Afghan~ AF    AFG    1980          NA           NA           NA           NA
##  2 Afghan~ AF    AFG    1981          NA           NA           NA           NA
##  3 Afghan~ AF    AFG    1982          NA           NA           NA           NA
##  4 Afghan~ AF    AFG    1983          NA           NA           NA           NA
##  5 Afghan~ AF    AFG    1984          NA           NA           NA           NA
##  6 Afghan~ AF    AFG    1985          NA           NA           NA           NA
##  7 Afghan~ AF    AFG    1986          NA           NA           NA           NA
##  8 Afghan~ AF    AFG    1987          NA           NA           NA           NA
##  9 Afghan~ AF    AFG    1988          NA           NA           NA           NA
## 10 Afghan~ AF    AFG    1989          NA           NA           NA           NA
## # ... with 7,230 more rows, and 52 more variables: new_sp_m4554 <int>,
## #   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## #   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## #   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## #   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## #   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## #   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## #   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## #   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## #   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## #   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## #   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## #   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## #   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## #   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## #   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## #   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>

#> # A tibble: 7,240 x 60
#>   country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
#>   <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>
#> 1 Afghan. AF    AFG    1980          NA           NA           NA           NA
#> 2 Afghan. AF    AFG    1981          NA           NA           NA           NA
#> 3 Afghan. AF    AFG    1982          NA           NA           NA           NA
#> 4 Afghan. AF    AFG    1983          NA           NA           NA           NA
#> 5 Afghan. AF    AFG    1984          NA           NA           NA           NA
#> 6 Afghan. AF    AFG    1985          NA           NA           NA           NA
#> # . with 7,234 more rows, and 52 more variables: new_sp_m4554 <int>,
#> #   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
#> #   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
#> #   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
#> #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
#> #   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
#> #   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
#> #   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
#> #   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
#> #   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
#> #   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
#> #   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
#> #   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
#> #   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
#> #   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
#> #   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
#> #   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
#> #   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>

who1 <- who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65,
    names_to = "key",
    values_to = "cases",
    values_drop_na = TRUE
  )
who1

## # A tibble: 76,046 x 6
##    country     iso2  iso3   year key          cases
##    <chr>       <chr> <chr> <int> <chr>        <int>
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36
## # ... with 76,036 more rows

#> # A tibble: 76,046 x 6
#>   country     iso2  iso3   year key          cases
#>   <chr>       <chr> <chr> <int> <chr>        <int>
#> 1 Afghanistan AF    AFG    1997 new_sp_m014      0
#> 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
#> 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
#> 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
#> 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
#> 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
#> # . with 76,040 more rows

who1 %>%
  count(key)

## # A tibble: 56 x 2
##    key              n
##    <chr>        <int>
##  1 new_ep_f014   1032
##  2 new_ep_f1524  1021
##  3 new_ep_f2534  1021
##  4 new_ep_f3544  1021
##  5 new_ep_f4554  1017
##  6 new_ep_f5564  1017
##  7 new_ep_f65    1014
##  8 new_ep_m014   1038
##  9 new_ep_m1524  1026
## 10 new_ep_m2534  1020
## # ... with 46 more rows

#> # A tibble: 56 x 2
#>   key              n
#>   <chr>        <int>
#> 1 new_ep_f014   1032
#> 2 new_ep_f1524  1021
#> 3 new_ep_f2534  1021
#> 4 new_ep_f3544  1021
#> 5 new_ep_f4554  1017
#> 6 new_ep_f5564  1017
#> # . with 50 more rows

who2 <- who1 %>%
  mutate(names_from = stringr::str_replace(key, "newrel", "new_rel"))
who2

## # A tibble: 76,046 x 7
##    country     iso2  iso3   year key          cases names_from  
##    <chr>       <chr> <chr> <int> <chr>        <int> <chr>       
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0 new_sp_m014 
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10 new_sp_m1524
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6 new_sp_m2534
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3 new_sp_m3544
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5 new_sp_m4554
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2 new_sp_m5564
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0 new_sp_m65  
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5 new_sp_f014 
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38 new_sp_f1524
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36 new_sp_f2534
## # ... with 76,036 more rows

#> # A tibble: 76,046 x 7
#>   country     iso2  iso3   year key          cases names_from  
#>   <chr>       <chr> <chr> <int> <chr>        <int> <chr>       
#> 1 Afghanistan AF    AFG    1997 new_sp_m014      0 new_sp_m014 
#> 2 Afghanistan AF    AFG    1997 new_sp_m1524    10 new_sp_m1524
#> 3 Afghanistan AF    AFG    1997 new_sp_m2534     6 new_sp_m2534
#> 4 Afghanistan AF    AFG    1997 new_sp_m3544     3 new_sp_m3544
#> 5 Afghanistan AF    AFG    1997 new_sp_m4554     5 new_sp_m4554
#> 6 Afghanistan AF    AFG    1997 new_sp_m5564     2 new_sp_m5564
#> # . with 76,040 more rows

who3 <- who2 %>%
  separate(key, c("new", "type", "sexage"), sep = "_")

## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
## 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
## 904, 905, 906, ...].

#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
#> 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
#> 904, 905, 906, ...].
who3

## # A tibble: 76,046 x 9
##    country     iso2  iso3   year new   type  sexage cases names_from  
##    <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int> <chr>       
##  1 Afghanistan AF    AFG    1997 new   sp    m014       0 new_sp_m014 
##  2 Afghanistan AF    AFG    1997 new   sp    m1524     10 new_sp_m1524
##  3 Afghanistan AF    AFG    1997 new   sp    m2534      6 new_sp_m2534
##  4 Afghanistan AF    AFG    1997 new   sp    m3544      3 new_sp_m3544
##  5 Afghanistan AF    AFG    1997 new   sp    m4554      5 new_sp_m4554
##  6 Afghanistan AF    AFG    1997 new   sp    m5564      2 new_sp_m5564
##  7 Afghanistan AF    AFG    1997 new   sp    m65        0 new_sp_m65  
##  8 Afghanistan AF    AFG    1997 new   sp    f014       5 new_sp_f014 
##  9 Afghanistan AF    AFG    1997 new   sp    f1524     38 new_sp_f1524
## 10 Afghanistan AF    AFG    1997 new   sp    f2534     36 new_sp_f2534
## # ... with 76,036 more rows

#> # A tibble: 76,046 x 9
#>   country     iso2  iso3   year new   type  sexage cases names_from  
#>   <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int> <chr>       
#> 1 Afghanistan AF    AFG    1997 new   sp    m014       0 new_sp_m014 
#> 2 Afghanistan AF    AFG    1997 new   sp    m1524     10 new_sp_m1524
#> 3 Afghanistan AF    AFG    1997 new   sp    m2534      6 new_sp_m2534
#> 4 Afghanistan AF    AFG    1997 new   sp    m3544      3 new_sp_m3544
#> 5 Afghanistan AF    AFG    1997 new   sp    m4554      5 new_sp_m4554
#> 6 Afghanistan AF    AFG    1997 new   sp    m5564      2 new_sp_m5564
#> # . with 76,040 more rows

who3 %>%
  count(new)

## # A tibble: 2 x 2
##   new        n
##   <chr>  <int>
## 1 new    73466
## 2 newrel  2580

#> # A tibble: 2 x 2
#>   new        n
#>   <chr>  <int>
#> 1 new    73466
#> 2 newrel  2580
who4 <- who3 %>%
  select(-new, -iso2, -iso3)

who5 <- who4 %>%
  separate(sexage, c("sex", "age"), sep = 1)
who5

## # A tibble: 76,046 x 7
##    country      year type  sex   age   cases names_from  
##    <chr>       <int> <chr> <chr> <chr> <int> <chr>       
##  1 Afghanistan  1997 sp    m     014       0 new_sp_m014 
##  2 Afghanistan  1997 sp    m     1524     10 new_sp_m1524
##  3 Afghanistan  1997 sp    m     2534      6 new_sp_m2534
##  4 Afghanistan  1997 sp    m     3544      3 new_sp_m3544
##  5 Afghanistan  1997 sp    m     4554      5 new_sp_m4554
##  6 Afghanistan  1997 sp    m     5564      2 new_sp_m5564
##  7 Afghanistan  1997 sp    m     65        0 new_sp_m65  
##  8 Afghanistan  1997 sp    f     014       5 new_sp_f014 
##  9 Afghanistan  1997 sp    f     1524     38 new_sp_f1524
## 10 Afghanistan  1997 sp    f     2534     36 new_sp_f2534
## # ... with 76,036 more rows

#> # A tibble: 76,046 x 7
#>   country      year type  sex   age   cases names_from  
#>   <chr>       <int> <chr> <chr> <chr> <int> <chr>       
#> 1 Afghanistan  1997 sp    m     014       0 new_sp_m014 
#> 2 Afghanistan  1997 sp    m     1524     10 new_sp_m1524
#> 3 Afghanistan  1997 sp    m     2534      6 new_sp_m2534
#> 4 Afghanistan  1997 sp    m     3544      3 new_sp_m3544
#> 5 Afghanistan  1997 sp    m     4554      5 new_sp_m4554
#> 6 Afghanistan  1997 sp    m     5564      2 new_sp_m5564
#> # . with 76,040 more rows

who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65,
    names_to = "key",
    values_to = "cases",
    values_drop_na = TRUE
  ) %>%
  mutate(
    key = stringr::str_replace(key, "newrel", "new_rel")
  ) %>%
  separate(key, c("new", "var", "sexage")) %>%
  select(-new, -iso2, -iso3) %>%
  separate(sexage, c("sex", "age"), sep = 1)

## # A tibble: 76,046 x 6
##    country      year var   sex   age   cases
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1997 sp    m     1524     10
##  3 Afghanistan  1997 sp    m     2534      6
##  4 Afghanistan  1997 sp    m     3544      3
##  5 Afghanistan  1997 sp    m     4554      5
##  6 Afghanistan  1997 sp    m     5564      2
##  7 Afghanistan  1997 sp    m     65        0
##  8 Afghanistan  1997 sp    f     014       5
##  9 Afghanistan  1997 sp    f     1524     38
## 10 Afghanistan  1997 sp    f     2534     36
## # ... with 76,036 more rows

1- In this case study I set values_drop_na = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What's the difference between an NA and zero?

who %>%
  group_by(country) %>%
  summarize(year_min = min(year), year_max = max(year)) %>%
  ggplot() +
  geom_point(mapping = aes(x = country, y = year_min), color = 'red') +
  geom_point(mapping = aes(x = country , y= year_max), color = 'blue') +
  coord_flip()

## `summarise()` ungrouping output (override with `.groups` argument)

who %>%
  group_by(country) %>%
  summarize(count = n()) %>%
  ggplot() +
  geom_point(mapping = aes(x = country, y = count), color = 'green') +
  coord_flip()

## `summarise()` ungrouping output (override with `.groups` argument)

sum(who %>% select(-c(1:4)) == 0, na.rm = TRUE)

## [1] 11080

who %>% select(-c(1:4)) %>%
  sapply(function(x){sum(is.na(x))})

##  new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544 new_sp_m4554 new_sp_m5564 
##         4067         4031         4034         4021         4017         4022 
##   new_sp_m65  new_sp_f014 new_sp_f1524 new_sp_f2534 new_sp_f3544 new_sp_f4554 
##         4031         4066         4046         4040         4041         4036 
## new_sp_f5564   new_sp_f65  new_sn_m014 new_sn_m1524 new_sn_m2534 new_sn_m3544 
##         4045         4043         6195         6210         6218         6215 
## new_sn_m4554 new_sn_m5564   new_sn_m65  new_sn_f014 new_sn_f1524 new_sn_f2534 
##         6213         6219         6220         6200         6218         6224 
## new_sn_f3544 new_sn_f4554 new_sn_f5564   new_sn_f65  new_ep_m014 new_ep_m1524 
##         6220         6222         6223         6221         6202         6214 
## new_ep_m2534 new_ep_m3544 new_ep_m4554 new_ep_m5564   new_ep_m65  new_ep_f014 
##         6220         6216         6220         6225         6222         6208 
## new_ep_f1524 new_ep_f2534 new_ep_f3544 new_ep_f4554 new_ep_f5564   new_ep_f65 
##         6219         6219         6219         6223         6223         6226 
##  newrel_m014 newrel_m1524 newrel_m2534 newrel_m3544 newrel_m4554 newrel_m5564 
##         7050         7058         7057         7056         7056         7055 
##   newrel_m65  newrel_f014 newrel_f1524 newrel_f2534 newrel_f3544 newrel_f4554 
##         7058         7050         7056         7058         7057         7057 
## newrel_f5564   newrel_f65 
##         7057         7055

who %>% select(1:3) %>% sapply(function(x){length(unique(x))})

## country    iso2    iso3 
##     219     219     219

who %>% select(1:3) %>%
  unite(combined, 1:3) %>%
  select(combined) %>%
  distinct() %>%
  nrow()

## [1] 219

who %>%
  gather(code, value, new_sp_m014:newrel_f65, na.rm = TRUE) %>%
  mutate(code = stringr::str_replace(code, "newrel", "new_rel")) %>%
  separate(code, c("new", "var", "sexage")) %>%
  select(-new, -iso2, -iso3) %>%
  separate(sexage, c("sex", "age"), sep = 1) %>%
  group_by(country, year, sex) %>%
  summarize(total_case = sum(value)) %>%
  unite(country_sex, country, sex, remove = FALSE) %>%
  ggplot() +
  geom_line(mapping = aes(x = year, y = total_case, color = sex,
                          group = country_sex))

## `summarise()` regrouping output by 'country', 'year' (override with `.groups` argument)