Chapter 12 Tidy Data Notes

12.1.1 Prerequisites

library(tidyverse)
## ── Attaching packages ──────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

12.2 Tidy Data

# runs the tables
table1
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
table2
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
table3
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
table4a 
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
table4b 
## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583

Dataset Tidy: Each variable must have its own column Each observation must have its own row Each value must have its own cell

# Compute rate per 10,000
table1 %>% 
  mutate(rate = cases / population * 10000)
## # A tibble: 6 x 5
##   country      year  cases population  rate
##   <chr>       <int>  <int>      <int> <dbl>
## 1 Afghanistan  1999    745   19987071 0.373
## 2 Afghanistan  2000   2666   20595360 1.29 
## 3 Brazil       1999  37737  172006362 2.19 
## 4 Brazil       2000  80488  174504898 4.61 
## 5 China        1999 212258 1272915272 1.67 
## 6 China        2000 213766 1280428583 1.67
# Compute cases per year
table1 %>% 
  count(year, wt = cases)
## # A tibble: 2 x 2
##    year      n
##   <int>  <int>
## 1  1999 250740
## 2  2000 296920

12.2.1 Exercises

#1 Using prose, describe how the variables and observations are organised in each of the sample tables.

#Columns are their own variables, meaning year only has year or country only has a country.
table1
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
# Combined columns, such as cases and population in the type column.
table2
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
# Rate is a combine column of separated by a /.
table3
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
# Contains the cases, with the the years separated into separate columns.
table4a 
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
# Contains the population.
table4b 
## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583
#2 Compute the rate for table2, and table 4a+table 4b. 
  #1 Extract the bumber of TB case per country per year.
  #2 Extract the matching population per country per year.
  #3 Divide the cases by population and mulitply by 1000
  #4 Sore back in the appropraite place
# Which representation is easiest to work with? Which is hardest? Why?

#Table2
cases <- table2 %>%
  filter(type == "cases")
populations <- table2 %>%
  filter(type == "population")
merged <- cbind(cases, populations)
merged$rates= (cases$count / populations$count)*1000
merged
##       country year  type  count     country year       type      count
## 1 Afghanistan 1999 cases    745 Afghanistan 1999 population   19987071
## 2 Afghanistan 2000 cases   2666 Afghanistan 2000 population   20595360
## 3      Brazil 1999 cases  37737      Brazil 1999 population  172006362
## 4      Brazil 2000 cases  80488      Brazil 2000 population  174504898
## 5       China 1999 cases 212258       China 1999 population 1272915272
## 6       China 2000 cases 213766       China 2000 population 1280428583
##       rates
## 1 0.0372741
## 2 0.1294466
## 3 0.2193930
## 4 0.4612363
## 5 0.1667495
## 6 0.1669488
#Tables4
rates_99 <- (table4a$`1999` / table4b$`1999`)*1000
rates_00 <- (table4a$`2000`/ table4b$`2000`)*1000
new <- table4a['country']
new$rates_00 = rates_00
new$rates_99 = rates_99
new
## # A tibble: 3 x 3
##   country     rates_00 rates_99
##   <chr>          <dbl>    <dbl>
## 1 Afghanistan    0.129   0.0373
## 2 Brazil         0.461   0.219 
## 3 China          0.167   0.167
#Table 4a and 4b were easier to work with, whereas Table 2 had to be separated and filter before doing any of the calculations.

12.3 Pivoting

12.3.1 Longer

A common problem is a dataset where some of the column names are not names of variables, but values of a variable.

pivot_longer() makes wide tables narrower and longer

table4a
## # A tibble: 3 x 3
##   country     `1999` `2000`
## * <chr>        <int>  <int>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
# Changes 1999 to year and 2000 to cases
table4a %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
## # A tibble: 6 x 3
##   country     year   cases
##   <chr>       <chr>  <int>
## 1 Afghanistan 1999     745
## 2 Afghanistan 2000    2666
## 3 Brazil      1999   37737
## 4 Brazil      2000   80488
## 5 China       1999  212258
## 6 China       2000  213766
table4b
## # A tibble: 3 x 3
##   country         `1999`     `2000`
## * <chr>            <int>      <int>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583
# Tidying table 4b by using pivot_longer
table4b %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
## # A tibble: 6 x 3
##   country     year  population
##   <chr>       <chr>      <int>
## 1 Afghanistan 1999    19987071
## 2 Afghanistan 2000    20595360
## 3 Brazil      1999   172006362
## 4 Brazil      2000   174504898
## 5 China       1999  1272915272
## 6 China       2000  1280428583
# Combines the tidied table4a and table4b
tidy4a <- table4a %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b)
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
##   country     year   cases population
##   <chr>       <chr>  <int>      <int>
## 1 Afghanistan 1999     745   19987071
## 2 Afghanistan 2000    2666   20595360
## 3 Brazil      1999   37737  172006362
## 4 Brazil      2000   80488  174504898
## 5 China       1999  212258 1272915272
## 6 China       2000  213766 1280428583

12.3.2 Wider

pivot_wider() is the opposite of pivot_longer(). You use it when an observation is scattered across multiple rows.

pivot_wider() makes long tables shorter and wider.

table2
## # A tibble: 12 x 4
##    country      year type            count
##    <chr>       <int> <chr>           <int>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
# separates type into two columns of case and population with the count as the values
table2 %>%
    pivot_wider(names_from = type, values_from = count)
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583

12.3.3 Exercises

#1 Why are pivot_longer and pivot_wider not perfectly symmetrical?
stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>% 
  pivot_wider(names_from = year, values_from = return) %>% 
  pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
## # A tibble: 4 x 3
##    half year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     1 2016    0.92
## 3     2 2015    0.59
## 4     2 2016    0.17
# Since the shape of the original tibble has changed and the orignal placement of columns and values. Pivot_wider() creates each year as it's own column instead of a double value column.
#2 Why does this code fail?

#table4a %>% 
#  pivot_longer(c(1999, 2000), names_to = "year", values_to = "cases")

# Fails because we tried to use column names as integers
#3 Why would happen if you widen this table? Why? How could you add a new column to uniquely identify each value?
people <- tribble(
  ~name,             ~names,  ~values,
  #-----------------|--------|------
  "Phillip Woods",   "age",       45,
  "Phillip Woods",   "height",   186,
  "Phillip Woods",   "age",       50,
  "Jessica Cordero", "age",       37,
  "Jessica Cordero", "height",   156
)
# Receive an error when trying to widen the table, due to the duplication of names. To avoid the error, I would create columns for age and one for height.
people %>%
  pivot_wider(names_from = names, values_from = values)
## Warning: Values in `values` are not uniquely identified; output will contain list-cols.
## * Use `values_fn = list(values = list)` to suppress this warning.
## * Use `values_fn = list(values = length)` to identify where the duplicates arise
## * Use `values_fn = list(values = summary_fun)` to summarise duplicates
## # A tibble: 2 x 3
##   name                    age      height
##   <chr>           <list<dbl>> <list<dbl>>
## 1 Phillip Woods           [2]         [1]
## 2 Jessica Cordero         [1]         [1]
#4 Tidy the simple tibble below. Do you need to make it wider or longer? What are the variables?
preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes",     NA,    10,
  "no",      20,    12
)

preg%>%
  pivot_longer(c('male','female'), names_to="sex",values_to=)
## # A tibble: 4 x 3
##   pregnant sex    value
##   <chr>    <chr>  <dbl>
## 1 yes      male      NA
## 2 yes      female    10
## 3 no       male      20
## 4 no       female    12

12.4 Separating and Uniting

12.4.1 Separate

separate() pulls apart one column into multiple columns, by splitting wherever a separator character appears.

table3
## # A tibble: 6 x 3
##   country      year rate             
## * <chr>       <int> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
# separates rate into cases and population at the forward slash
table3 %>% 
  separate(rate, into = c("cases", "population"))
## # A tibble: 6 x 4
##   country      year cases  population
##   <chr>       <int> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
# could also write as:
#table3 %>% 
  #separate(rate, into = c("cases", "population"), sep = "/")
table3 %>% 
  separate(rate, into = c("cases", "population"), convert = TRUE)
## # A tibble: 6 x 4
##   country      year  cases population
##   <chr>       <int>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
table3 %>% 
  separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 6 x 4
##   country     century year  rate             
##   <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583

12.4.2 Unite

unite() is the inverse of separate(): it combines multiple columns into a single column

table5 %>% 
  unite(new, century, year)
## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 19_99 745/19987071     
## 2 Afghanistan 20_00 2666/20595360    
## 3 Brazil      19_99 37737/172006362  
## 4 Brazil      20_00 80488/174504898  
## 5 China       19_99 212258/1272915272
## 6 China       20_00 213766/1280428583
table5 %>% 
  unite(new, century, year, sep = "")
## # A tibble: 6 x 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583

12.4.3 Exercises

#1 What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.

# The extra argument 'g'. The extra tells what to do if there are too many
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
  separate(x, c("one", "two", "three"), extra = "drop")
## # A tibble: 3 x 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     f    
## 3 h     i     j
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
  separate(x, c("one", "two", "three"), extra = "merge")
## # A tibble: 3 x 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     f,g  
## 3 h     i     j
# The fill argument 'f'. The fill argument tells it what to do if there is something missing.
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
  separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
## # A tibble: 3 x 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     <NA> 
## 3 f     g     i
#2 Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?

#The remove argument removes/discards the columns used to either unite or separate. Setting it to FALSE, keeps the original columns.
#3 Compare and contrast separate() and extract(). Why are there three variations of separation (position, separator, groups), but only one unite?

# Separate() splits a column into multiple columns.
# Extract() splits a column into multuple columns as well.
# Separate and Extract both start with a single column broken into several columns.
# Unite takes multiple columns converts it into a single column.

12.5 Missing Values

  • Explicitly, i.e. flagged with NA.
  • Implicitly, i.e. simply not present in the data.
# Missing two values
stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)
stocks %>% 
  pivot_wider(names_from = year, values_from = return)
## # A tibble: 4 x 3
##     qtr `2015` `2016`
##   <dbl>  <dbl>  <dbl>
## 1     1   1.88  NA   
## 2     2   0.59   0.92
## 3     3   0.35   0.17
## 4     4  NA      2.66
stocks %>% 
  pivot_wider(names_from = year, values_from = return) %>% 
  pivot_longer(
    cols = c(`2015`, `2016`), 
    names_to = "year", 
    values_to = "return", 
    values_drop_na = TRUE
  )
## # A tibble: 6 x 3
##     qtr year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     2 2015    0.59
## 3     2 2016    0.92
## 4     3 2015    0.35
## 5     3 2016    0.17
## 6     4 2016    2.66
stocks %>% 
  complete(year, qtr)
## # A tibble: 8 x 3
##    year   qtr return
##   <dbl> <dbl>  <dbl>
## 1  2015     1   1.88
## 2  2015     2   0.59
## 3  2015     3   0.35
## 4  2015     4  NA   
## 5  2016     1  NA   
## 6  2016     2   0.92
## 7  2016     3   0.17
## 8  2016     4   2.66
treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)
# Fill takes a set of columns where you want missing values to be replaced by the most recent non-missing value (sometimes called last observation carried forward).
treatment %>% 
  fill(person)
## # A tibble: 4 x 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 Derrick Whitmore         2       10
## 3 Derrick Whitmore         3        9
## 4 Katherine Burke          1        4

12.5.1 Exercises

#1 Compare and contrast fill arguments to pivot_wider() and complete().

treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

# Carries Derrick Whitmore to fill in the NA in person
treatment %>% 
  fill(person)
## # A tibble: 4 x 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 Derrick Whitmore         2       10
## 3 Derrick Whitmore         3        9
## 4 Katherine Burke          1        4
# Creates an error there is missing data from the person column. Must do pivot_longer() as well to drop the NA values.

#pivot_wider(names_from = person, values_from = response)

# Finds unique values and creates several NA in person
treatment %>%
  complete(treatment, response)
## # A tibble: 12 x 3
##    treatment response person          
##        <dbl>    <dbl> <chr>           
##  1         1        4 Katherine Burke 
##  2         1        7 Derrick Whitmore
##  3         1        9 <NA>            
##  4         1       10 <NA>            
##  5         2        4 <NA>            
##  6         2        7 <NA>            
##  7         2        9 <NA>            
##  8         2       10 <NA>            
##  9         3        4 <NA>            
## 10         3        7 <NA>            
## 11         3        9 <NA>            
## 12         3       10 <NA>
#2 What does the direction argument to fill() do?

# The direction tells it in which direction it should take the last data to fill in the blanks with.


treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

# Carries Katherine Burke to fill in the NA
fill(treatment, person, .direction = "up")
## # A tibble: 4 x 3
##   person           treatment response
##   <chr>                <dbl>    <dbl>
## 1 Derrick Whitmore         1        7
## 2 Katherine Burke          2       10
## 3 Katherine Burke          3        9
## 4 Katherine Burke          1        4

12.6 Case Study

who
## # A tibble: 7,240 x 60
##    country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
##    <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>
##  1 Afghan… AF    AFG    1980          NA           NA           NA           NA
##  2 Afghan… AF    AFG    1981          NA           NA           NA           NA
##  3 Afghan… AF    AFG    1982          NA           NA           NA           NA
##  4 Afghan… AF    AFG    1983          NA           NA           NA           NA
##  5 Afghan… AF    AFG    1984          NA           NA           NA           NA
##  6 Afghan… AF    AFG    1985          NA           NA           NA           NA
##  7 Afghan… AF    AFG    1986          NA           NA           NA           NA
##  8 Afghan… AF    AFG    1987          NA           NA           NA           NA
##  9 Afghan… AF    AFG    1988          NA           NA           NA           NA
## 10 Afghan… AF    AFG    1989          NA           NA           NA           NA
## # … with 7,230 more rows, and 52 more variables: new_sp_m4554 <int>,
## #   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## #   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## #   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## #   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## #   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## #   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## #   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## #   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## #   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## #   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## #   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## #   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## #   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## #   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## #   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## #   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## #   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
# Focus on the values that are present and create new colum names.
who1 <- who %>% 
  pivot_longer(
    cols = new_sp_m014:newrel_f65, 
    names_to = "key", 
    values_to = "cases", 
    values_drop_na = TRUE
  )
who1
## # A tibble: 76,046 x 6
##    country     iso2  iso3   year key          cases
##    <chr>       <chr> <chr> <int> <chr>        <int>
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36
## # … with 76,036 more rows
# Structure of the values
who1 %>% 
  count(key)
## # A tibble: 56 x 2
##    key              n
##    <chr>        <int>
##  1 new_ep_f014   1032
##  2 new_ep_f1524  1021
##  3 new_ep_f2534  1021
##  4 new_ep_f3544  1021
##  5 new_ep_f4554  1017
##  6 new_ep_f5564  1017
##  7 new_ep_f65    1014
##  8 new_ep_m014   1038
##  9 new_ep_m1524  1026
## 10 new_ep_m2534  1020
## # … with 46 more rows
  • First three letters of each column denote new or old cases of TB.
  • Next two letters describe the type of TB:
    • rel = relapse
    • ep = extrapulmonary TB
    • sn = smear negative
    • sp = smear positive
  • Sixth letter gives the sex of the patient. M = male; F = female
  • Numbers represent the age:
    • 014 = 0 – 14 years old
    • 1524 = 15 – 24 years old
    • 2534 = 25 – 34 years old
    • 3544 = 35 – 44 years old
    • 4554 = 45 – 54 years old
    • 5564 = 55 – 64 years old
    • 65 = 65 or older
# Fixes variable names that are inconsistent
who2 <- who1 %>% 
  mutate(names_from = stringr::str_replace(key, "newrel", "new_rel"))
who2
## # A tibble: 76,046 x 7
##    country     iso2  iso3   year key          cases names_from  
##    <chr>       <chr> <chr> <int> <chr>        <int> <chr>       
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0 new_sp_m014 
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10 new_sp_m1524
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6 new_sp_m2534
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3 new_sp_m3544
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5 new_sp_m4554
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2 new_sp_m5564
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0 new_sp_m65  
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5 new_sp_f014 
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38 new_sp_f1524
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36 new_sp_f2534
## # … with 76,036 more rows
# Separates at each underscore
who3 <- who2 %>% 
  separate(key, c("new", "type", "sexage"), sep = "_")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
## 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
## 904, 905, 906, ...].
who3
## # A tibble: 76,046 x 9
##    country     iso2  iso3   year new   type  sexage cases names_from  
##    <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int> <chr>       
##  1 Afghanistan AF    AFG    1997 new   sp    m014       0 new_sp_m014 
##  2 Afghanistan AF    AFG    1997 new   sp    m1524     10 new_sp_m1524
##  3 Afghanistan AF    AFG    1997 new   sp    m2534      6 new_sp_m2534
##  4 Afghanistan AF    AFG    1997 new   sp    m3544      3 new_sp_m3544
##  5 Afghanistan AF    AFG    1997 new   sp    m4554      5 new_sp_m4554
##  6 Afghanistan AF    AFG    1997 new   sp    m5564      2 new_sp_m5564
##  7 Afghanistan AF    AFG    1997 new   sp    m65        0 new_sp_m65  
##  8 Afghanistan AF    AFG    1997 new   sp    f014       5 new_sp_f014 
##  9 Afghanistan AF    AFG    1997 new   sp    f1524     38 new_sp_f1524
## 10 Afghanistan AF    AFG    1997 new   sp    f2534     36 new_sp_f2534
## # … with 76,036 more rows
who3 %>% 
  count(new)
## # A tibble: 2 x 2
##   new        n
##   <chr>  <int>
## 1 new    73466
## 2 newrel  2580
# Drops columns
who4 <- who3 %>% 
  select(-new, -iso2, -iso3)
# Separates sexage into sex and age
who5 <- who4 %>% 
  separate(sexage, c("sex", "age"), sep = 1)

who5
## # A tibble: 76,046 x 7
##    country      year type  sex   age   cases names_from  
##    <chr>       <int> <chr> <chr> <chr> <int> <chr>       
##  1 Afghanistan  1997 sp    m     014       0 new_sp_m014 
##  2 Afghanistan  1997 sp    m     1524     10 new_sp_m1524
##  3 Afghanistan  1997 sp    m     2534      6 new_sp_m2534
##  4 Afghanistan  1997 sp    m     3544      3 new_sp_m3544
##  5 Afghanistan  1997 sp    m     4554      5 new_sp_m4554
##  6 Afghanistan  1997 sp    m     5564      2 new_sp_m5564
##  7 Afghanistan  1997 sp    m     65        0 new_sp_m65  
##  8 Afghanistan  1997 sp    f     014       5 new_sp_f014 
##  9 Afghanistan  1997 sp    f     1524     38 new_sp_f1524
## 10 Afghanistan  1997 sp    f     2534     36 new_sp_f2534
## # … with 76,036 more rows
# TIDY DATASET
who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65, 
    names_to = "key", 
    values_to = "cases", 
    values_drop_na = TRUE
  ) %>% 
  mutate(
    key = stringr::str_replace(key, "newrel", "new_rel")
  ) %>%
  separate(key, c("new", "var", "sexage")) %>% 
  select(-new, -iso2, -iso3) %>% 
  separate(sexage, c("sex", "age"), sep = 1)
## # A tibble: 76,046 x 6
##    country      year var   sex   age   cases
##    <chr>       <int> <chr> <chr> <chr> <int>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1997 sp    m     1524     10
##  3 Afghanistan  1997 sp    m     2534      6
##  4 Afghanistan  1997 sp    m     3544      3
##  5 Afghanistan  1997 sp    m     4554      5
##  6 Afghanistan  1997 sp    m     5564      2
##  7 Afghanistan  1997 sp    m     65        0
##  8 Afghanistan  1997 sp    f     014       5
##  9 Afghanistan  1997 sp    f     1524     38
## 10 Afghanistan  1997 sp    f     2534     36
## # … with 76,036 more rows

12.6.1 Exercises

#1 In this case study I set values_drop_na = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What's the difference between an NA and 0?

# values_drop_na = TRUE may be reasonable depending on no cases of TB were represented. If there are no 0s in the dataset, then NA may represent no cases of TB. Therefore dropping the NA values skews the data. 0 is an integer while NA means a blank in the dataset.
#2 What happens if you neglect the mutate() step?
#(mutate(names_from = stringr::str_replace(key, "newrel", "new_rel")))

# The mutate steps replaces the variable name with new_rel for consistency purposes and to avoid following errors. It allows for the separation at the '_' in the following step to avoid multiple columns.
#3 I claimed that iso2 and iso3 were redundant with country. Confirm this claim.

select(who3, country, iso2, iso3) %>%
  distinct() %>%
  group_by(country) %>%
  filter(n() > 1)
## # A tibble: 0 x 3
## # Groups:   country [0]
## # … with 3 variables: country <chr>, iso2 <chr>, iso3 <chr>
#4 For each country, year, and sex compute the total number of cases of TB.

who5 %>%
  group_by(country, year, sex) %>%
  summarise(cases = sum(cases))
## # A tibble: 6,729 x 4
## # Groups:   country, year [3,484]
##    country      year sex   cases
##    <chr>       <int> <chr> <int>
##  1 Afghanistan  1997 f       102
##  2 Afghanistan  1997 m        26
##  3 Afghanistan  1998 f      1207
##  4 Afghanistan  1998 m       571
##  5 Afghanistan  1999 f       517
##  6 Afghanistan  1999 m       228
##  7 Afghanistan  2000 f      1751
##  8 Afghanistan  2000 m       915
##  9 Afghanistan  2001 f      3062
## 10 Afghanistan  2001 m      1577
## # … with 6,719 more rows

12.7 Non-Tidy Data

Two main reasons to use other data structures: Alternative representations may have substantial performance or space advantages. Specialised fields have evolved their own conventions for storing data that may be quite different to the conventions of tidy data.

library(nycflights13)
library(tidyverse)

Notes: Chapter 5 Data Transformation

nycflights13::flights
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

5.2 Filter rows with filter()

Allows to subset observations based on their values First argument: name of data frame Second argument +: expressions that filter the data frame

filter(flights, month == 1, day == 1)
## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

The arrow saves the results

jan1 <- filter(flights, month == 1, day == 1)

The arrow and parentheses saves AND prints the results

(dec25 <- filter(flights, month == 12, day == 25))
## # A tibble: 719 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12    25      456            500        -4      649            651
##  2  2013    12    25      524            515         9      805            814
##  3  2013    12    25      542            540         2      832            850
##  4  2013    12    25      546            550        -4     1022           1027
##  5  2013    12    25      556            600        -4      730            745
##  6  2013    12    25      557            600        -3      743            752
##  7  2013    12    25      557            600        -3      818            831
##  8  2013    12    25      559            600        -1      855            856
##  9  2013    12    25      559            600        -1      849            855
## 10  2013    12    25      600            600         0      850            846
## # … with 709 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

5.2.1 Comparisons

R provides the standard suite: >, >=, <, <=, != (not equal), and == (equal)

5.2.2 Logical Operators

# Finds all flights departed in November or December
filter(flights, month == 11 | month == 12)
## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    11     1        5           2359         6      352            345
##  2  2013    11     1       35           2250       105      123           2356
##  3  2013    11     1      455            500        -5      641            651
##  4  2013    11     1      539            545        -6      856            827
##  5  2013    11     1      542            545        -3      831            855
##  6  2013    11     1      549            600       -11      912            923
##  7  2013    11     1      550            600       -10      705            659
##  8  2013    11     1      554            600        -6      659            701
##  9  2013    11     1      554            600        -6      826            827
## 10  2013    11     1      554            600        -6      749            751
## # … with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Shorthand for the code above
nov_dec <- filter(flights, month %in% c(11, 12))
# Simplifed complicated subsetting by rem
filter(flights, !(arr_delay > 120 | dep_delay > 120))
## # A tibble: 316,050 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 316,040 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, arr_delay <= 120, dep_delay <= 120)
## # A tibble: 316,050 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 316,040 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

5.2.3 Missing Values

NA represents an unknown value

df <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
## # A tibble: 1 x 1
##       x
##   <dbl>
## 1     3
#this showcases both the FALSE and NA values explicitly
filter(df, is.na(x) | x > 1)
## # A tibble: 2 x 1
##       x
##   <dbl>
## 1    NA
## 2     3

5.2.4 Exercises

# 1.1 Had an arrival delay of two or more hours
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # … with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 1.2 Flew to Houston (IAH to HOU)
filter(flights, origin == "IAH", dest == "HOU")
## # A tibble: 0 x 19
## # … with 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# 1.3 Were operated by United, American, or Delta
filter(flights, carrier %in% c("UA","AA","DL"))
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      554            600        -6      812            837
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            600        -1      941            910
## 10  2013     1     1      559            600        -1      854            902
## # … with 139,494 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 1.4 Departed in Summer (July(7), August(8), September(9))
filter(flights, month %in% 7:9)
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 1.5 Arrived more than 2 hours late, but didn't leave late
filter(flights, arr_delay > 120, dep_delay <= 0)
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    27     1419           1420        -1     1754           1550
##  2  2013    10     7     1350           1350         0     1736           1526
##  3  2013    10     7     1357           1359        -2     1858           1654
##  4  2013    10    16      657            700        -3     1258           1056
##  5  2013    11     1      658            700        -2     1329           1015
##  6  2013     3    18     1844           1847        -3       39           2219
##  7  2013     4    17     1635           1640        -5     2049           1845
##  8  2013     4    18      558            600        -2     1149            850
##  9  2013     4    18      655            700        -5     1213            950
## 10  2013     5    22     1827           1830        -3     2217           2010
## # … with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 1.6 Were delayed by at least an hour, but made up over 30 mins in flight
filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)
## # A tibble: 1,844 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     2205           1720       285       46           2040
##  2  2013     1     1     2326           2130       116      131             18
##  3  2013     1     3     1503           1221       162     1803           1555
##  4  2013     1     3     1839           1700        99     2056           1950
##  5  2013     1     3     1850           1745        65     2148           2120
##  6  2013     1     3     1941           1759       102     2246           2139
##  7  2013     1     3     1950           1845        65     2228           2227
##  8  2013     1     3     2015           1915        60     2135           2111
##  9  2013     1     3     2257           2000       177       45           2224
## 10  2013     1     4     1917           1700       137     2135           1950
## # … with 1,834 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 1.7 Departed between midnight and 6am (inclusive)
filter(flights, dep_time >= 2400 | dep_time <= 600)
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# 2. Another usful dplyr filtering helper is between(). What does it do? Can you use it simplify the code needed to answer the previous challenge?

#The between() function finds the flights that occur between the variables, such as the summer months in problem 1.4. 
filter(flights, between(month, 7,9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#3 How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

#The arrival time, depart time, air time, and depart delay are all missing. This could imply the flight had been cancelled.
filter(flights, is.na(dep_time))
## # A tibble: 8,255 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # … with 8,245 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#4 Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule?

NA ^ 0
## [1] 1
# Equals 1 always

NA | TRUE
## [1] TRUE
# True is always true

NA & FALSE
## [1] FALSE
# False is always false

#NA * 0 is not equal to 0, because the NA represents infinity and is undefined.

5.3 Arrange Rows with Arrange()

Arrange() is similiar to filter(); but changes the order

arrange(flights, year, month, day)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#orders columns in descending order
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#missing values are sorted at the end
df <- tibble(x = c(5, 2, NA))
arrange(df, x)
## # A tibble: 3 x 1
##       x
##   <dbl>
## 1     2
## 2     5
## 3    NA
arrange(df, desc(x))
## # A tibble: 3 x 1
##       x
##   <dbl>
## 1     5
## 2     2
## 3    NA

5.3.1 Exercises

#1 How could you use arrange() to sort all missing values to the start?
arrange(flights, desc(is.na(dep_time)), dep_time)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#2 Sort flights to find the most delayed flights. Find the flights that left earliest.

#most delayed
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#left earliest
arrange(flights, dep_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013    10    23     1907           1932       -25     2143           2143
##  8  2013     3    30     2030           2055       -25     2213           2250
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#3 Sort flights to find the fastest(highest speed) flights.
arrange(flights, air_time)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    16     1355           1315        40     1442           1411
##  2  2013     4    13      537            527        10      622            628
##  3  2013    12     6      922            851        31     1021            954
##  4  2013     2     3     2153           2129        24     2247           2224
##  5  2013     2     5     1303           1315       -12     1342           1411
##  6  2013     2    12     2123           2130        -7     2211           2225
##  7  2013     3     2     1450           1500       -10     1547           1608
##  8  2013     3     8     2026           1935        51     2131           2056
##  9  2013     3    18     1456           1329        87     1533           1426
## 10  2013     3    19     2226           2145        41     2305           2246
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#4 Which flights travelled the farthest? Which travlled the shortest?

# travelled the farthest
arrange(flights, desc(distance))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      857            900        -3     1516           1530
##  2  2013     1     2      909            900         9     1525           1530
##  3  2013     1     3      914            900        14     1504           1530
##  4  2013     1     4      900            900         0     1516           1530
##  5  2013     1     5      858            900        -2     1519           1530
##  6  2013     1     6     1019            900        79     1558           1530
##  7  2013     1     7     1042            900       102     1620           1530
##  8  2013     1     8      901            900         1     1504           1530
##  9  2013     1     9      641            900      1301     1242           1530
## 10  2013     1    10      859            900        -1     1449           1530
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# travelled the shortest
arrange(flights, distance)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7    27       NA            106        NA       NA            245
##  2  2013     1     3     2127           2129        -2     2222           2224
##  3  2013     1     4     1240           1200        40     1333           1306
##  4  2013     1     4     1829           1615       134     1937           1721
##  5  2013     1     4     2128           2129        -1     2218           2224
##  6  2013     1     5     1155           1200        -5     1241           1306
##  7  2013     1     6     2125           2129        -4     2224           2224
##  8  2013     1     7     2124           2129        -5     2212           2224
##  9  2013     1     8     2127           2130        -3     2304           2225
## 10  2013     1     9     2126           2129        -3     2217           2224
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

5.4 Select Columns with Select()

Select() allows you to zoom in on useful subset using operations based on the names of the variables

select(flights, year, month, day)
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # … with 336,766 more rows
select(flights, year:day)
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # … with 336,766 more rows
select(flights, -(year:day))
## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
##       <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
##  1      517            515         2      830            819        11 UA     
##  2      533            529         4      850            830        20 UA     
##  3      542            540         2      923            850        33 AA     
##  4      544            545        -1     1004           1022       -18 B6     
##  5      554            600        -6      812            837       -25 DL     
##  6      554            558        -4      740            728        12 UA     
##  7      555            600        -5      913            854        19 B6     
##  8      557            600        -3      709            723       -14 EV     
##  9      557            600        -3      838            846        -8 B6     
## 10      558            600        -2      753            745         8 AA     
## # … with 336,766 more rows, and 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
  • starts_with(“abc”): matches names that begin with “abc”.

  • ends_with(“xyz”): matches names that end with “xyz”.

  • contains(“ijk”): matches names that contain “ijk”.

  • matches(“(.)\1”): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.

  • num_range(“x”, 1:3): matches x1, x2 and x3.

# renames the values
rename(flights, tail_num = tailnum)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tail_num <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# moves to the start of the data
select(flights, time_hour, air_time, everything())
## # A tibble: 336,776 x 19
##    time_hour           air_time  year month   day dep_time sched_dep_time
##    <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
##  1 2013-01-01 05:00:00      227  2013     1     1      517            515
##  2 2013-01-01 05:00:00      227  2013     1     1      533            529
##  3 2013-01-01 05:00:00      160  2013     1     1      542            540
##  4 2013-01-01 05:00:00      183  2013     1     1      544            545
##  5 2013-01-01 06:00:00      116  2013     1     1      554            600
##  6 2013-01-01 05:00:00      150  2013     1     1      554            558
##  7 2013-01-01 06:00:00      158  2013     1     1      555            600
##  8 2013-01-01 06:00:00       53  2013     1     1      557            600
##  9 2013-01-01 06:00:00      140  2013     1     1      557            600
## 10 2013-01-01 06:00:00      138  2013     1     1      558            600
## # … with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## #   hour <dbl>, minute <dbl>

5.4.1 Exercises

#1 Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.

select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # … with 336,766 more rows
#2 What happens if you include the name of a variable multiple times in a select() call?

select(flights, year, month, month)
## # A tibble: 336,776 x 2
##     year month
##    <int> <int>
##  1  2013     1
##  2  2013     1
##  3  2013     1
##  4  2013     1
##  5  2013     1
##  6  2013     1
##  7  2013     1
##  8  2013     1
##  9  2013     1
## 10  2013     1
## # … with 336,766 more rows
# it ignores the duplication and only incldues the variable once
#3 What does the one_of() function do? Why might it be helpful in conjunction with this vector?

vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))
## # A tibble: 336,776 x 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1         2        11
##  2  2013     1     1         4        20
##  3  2013     1     1         2        33
##  4  2013     1     1        -1       -18
##  5  2013     1     1        -6       -25
##  6  2013     1     1        -4        12
##  7  2013     1     1        -5        19
##  8  2013     1     1        -3       -14
##  9  2013     1     1        -3        -8
## 10  2013     1     1        -2         8
## # … with 336,766 more rows
#this only works if vars is not a column name in the flights data
#4 Does the result of running the following code surpise you? How do the select helpers deal wit case by default? How can you change that default?

select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
##       <int>          <int>    <int>          <int>    <dbl> <dttm>             
##  1      517            515      830            819      227 2013-01-01 05:00:00
##  2      533            529      850            830      227 2013-01-01 05:00:00
##  3      542            540      923            850      160 2013-01-01 05:00:00
##  4      544            545     1004           1022      183 2013-01-01 05:00:00
##  5      554            600      812            837      116 2013-01-01 06:00:00
##  6      554            558      740            728      150 2013-01-01 05:00:00
##  7      555            600      913            854      158 2013-01-01 06:00:00
##  8      557            600      709            723       53 2013-01-01 06:00:00
##  9      557            600      838            846      140 2013-01-01 06:00:00
## 10      558            600      753            745      138 2013-01-01 06:00:00
## # … with 336,766 more rows
# it is ingores case

select(flights, contains("TIME", ignore.case = FALSE))
## # A tibble: 336,776 x 0
# changes the default behavior

5.5 Add New Variables with Mutate()

Mutate(): Adds new columns that are functions of existing columns; adds columns to the end of your dataset

flights_sml <- select(flights, 
  year:day, 
  ends_with("delay"), 
  distance, 
  air_time
)
mutate(flights_sml,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60
)
## # A tibble: 336,776 x 9
##     year month   day dep_delay arr_delay distance air_time  gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9  370.
##  2  2013     1     1         4        20     1416      227   -16  374.
##  3  2013     1     1         2        33     1089      160   -31  408.
##  4  2013     1     1        -1       -18     1576      183    17  517.
##  5  2013     1     1        -6       -25      762      116    19  394.
##  6  2013     1     1        -4        12      719      150   -16  288.
##  7  2013     1     1        -5        19     1065      158   -24  404.
##  8  2013     1     1        -3       -14      229       53    11  259.
##  9  2013     1     1        -3        -8      944      140     5  405.
## 10  2013     1     1        -2         8      733      138   -10  319.
## # … with 336,766 more rows
# can refer to the columns that were just created
mutate(flights_sml,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 10
##     year month   day dep_delay arr_delay distance air_time  gain hours
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9 3.78 
##  2  2013     1     1         4        20     1416      227   -16 3.78 
##  3  2013     1     1         2        33     1089      160   -31 2.67 
##  4  2013     1     1        -1       -18     1576      183    17 3.05 
##  5  2013     1     1        -6       -25      762      116    19 1.93 
##  6  2013     1     1        -4        12      719      150   -16 2.5  
##  7  2013     1     1        -5        19     1065      158   -24 2.63 
##  8  2013     1     1        -3       -14      229       53    11 0.883
##  9  2013     1     1        -3        -8      944      140     5 2.33 
## 10  2013     1     1        -2         8      733      138   -10 2.3  
## # … with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
# only want to keep the new variables, use the transmute()
transmute(flights,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
## # A tibble: 336,776 x 3
##     gain hours gain_per_hour
##    <dbl> <dbl>         <dbl>
##  1    -9 3.78          -2.38
##  2   -16 3.78          -4.23
##  3   -31 2.67         -11.6 
##  4    17 3.05           5.57
##  5    19 1.93           9.83
##  6   -16 2.5           -6.4 
##  7   -24 2.63          -9.11
##  8    11 0.883         12.5 
##  9     5 2.33           2.14
## 10   -10 2.3           -4.35
## # … with 336,766 more rows

5.5.1 Useful Creation Functions

  • Arithmetic operators: +, -, *, /, ^
    • Arithmetic operators are also useful in conjunction with the aggregate functions
    • Not affected by group_by()
  • Modular arithmetic: %/% (integer division) and %% (remainder), where x == y * (x %/% y) + (x %% y)
    • Breaks integers into pieces
    • Not affected by group_by()
# example
transmute(flights,
  dep_time,
  hour = dep_time %/% 100,
  minute = dep_time %% 100
)
## # A tibble: 336,776 x 3
##    dep_time  hour minute
##       <int> <dbl>  <dbl>
##  1      517     5     17
##  2      533     5     33
##  3      542     5     42
##  4      544     5     44
##  5      554     5     54
##  6      554     5     54
##  7      555     5     55
##  8      557     5     57
##  9      557     5     57
## 10      558     5     58
## # … with 336,766 more rows
  • Logs: log(), log2(), log10().
    • log2(): easy to interpret
    • Not affected by group_by()
  • Offsets: lead() and lag() allow you to refer to leading or lagging values
    • Respect the groupings of group_by()
  • Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: cumsum(), cumprod(), cummin(), cummax(); and dplyr provides cummean() for cumulative means
    • Calculate the total within each group
  • Logical comparisons, <, <=, >, >=, !=, and ==
    • Not affected by group_by()
  • Ranking: there are a number of ranking functions, but you should start with min_rank()
    • Default gives smallest values; use desc(x) to give the largest values
    • Follow within in each group when used with group_by()

5.5.2 Exercises

#1 Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they're not really continuous numbers. Convert them to a more convenient represenation of number of minutes since midnight.


#divide dep_time by 100 to get hours since midnight, then mulitply by 60 to get minutes. Get the remainder of the dep_time and add the reaminding minutes. The 1440 represents midnight by converting it to 0.
flights_times <- mutate(flights,dep_time_mins = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,sched_dep_time_mins = (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %% 1440
)
# view columns
select(
  flights_times, dep_time, dep_time_mins, sched_dep_time,
  sched_dep_time_mins
)
## # A tibble: 336,776 x 4
##    dep_time dep_time_mins sched_dep_time sched_dep_time_mins
##       <int>         <dbl>          <int>               <dbl>
##  1      517           317            515                 315
##  2      533           333            529                 329
##  3      542           342            540                 340
##  4      544           344            545                 345
##  5      554           354            600                 360
##  6      554           354            558                 358
##  7      555           355            600                 360
##  8      557           357            600                 360
##  9      557           357            600                 360
## 10      558           358            600                 360
## # … with 336,766 more rows
#2 Compare air_time and arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

# I would think that air_time = arr_time-dep_time.
select(flights, air_time, arr_time, dep_time)
## # A tibble: 336,776 x 3
##    air_time arr_time dep_time
##       <dbl>    <int>    <int>
##  1      227      830      517
##  2      227      850      533
##  3      160      923      542
##  4      183     1004      544
##  5      116      812      554
##  6      150      740      554
##  7      158      913      555
##  8       53      709      557
##  9      140      838      557
## 10      138      753      558
## # … with 336,766 more rows
# The arr_time and dep_time need to be converted to minutes from midnight like #1 to get a true comparison.
#3 Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?

# I would expect the dep_delay = dep_time - sched_dep_time
select(flights, dep_time, sched_dep_time, dep_delay)
## # A tibble: 336,776 x 3
##    dep_time sched_dep_time dep_delay
##       <int>          <int>     <dbl>
##  1      517            515         2
##  2      533            529         4
##  3      542            540         2
##  4      544            545        -1
##  5      554            600        -6
##  6      554            558        -4
##  7      555            600        -5
##  8      557            600        -3
##  9      557            600        -3
## 10      558            600        -2
## # … with 336,766 more rows
# To get a true comparison, then the numbers would need to be converted to minutes from midnight. 
#4 Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

flights %>%
  filter(min_rank(-(dep_delay)) %in% 1:10)
## # A tibble: 10 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     1    10     1121           1635      1126     1239           1810
##  3  2013    12     5      756           1700       896     1058           2020
##  4  2013     3    17     2321            810       911      135           1020
##  5  2013     4    10     1100           1900       960     1342           2211
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     6    27      959           1900       899     1236           2226
##  8  2013     7    22      845           1600      1005     1044           1815
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013     9    20     1139           1845      1014     1457           2210
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#4 What does 1:3 + 1:10 return? Why?
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
##  [1]  2  4  6  5  7  9  8 10 12 11
# Returns a warning; could indicate a bug
#6 What trignometric function does R provide?

#sin(), cos(), tan()

5.6 Grouped Summaries with Summarise()

# Collapses data frame to a single row
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1  12.6
# Summarise() is more useful when paired with group_by()
by_day <- group_by (flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # … with 355 more rows

5.6.1 Combining Mulitple Operations with the Pipe

by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
# Same problem using the pipe %>%
# Focuses on the transformations, not what's being transformed
delays <- flights %>% 
  group_by(dest) %>% 
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  filter(count > 20, dest != "HNL")

5.6.2 Missing Values

# The na.rm produces a lot of missing values
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1    NA
##  2  2013     1     2    NA
##  3  2013     1     3    NA
##  4  2013     1     4    NA
##  5  2013     1     5    NA
##  6  2013     1     6    NA
##  7  2013     1     7    NA
##  8  2013     1     8    NA
##  9  2013     1     9    NA
## 10  2013     1    10    NA
## # … with 355 more rows
# Removes the missing values
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # … with 355 more rows
# Takes into account of the cancelled flights
not_cancelled <- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # … with 355 more rows

5.6.3 Counts

Good idea to include: count(n()) or (sum(!is.na(x))) to avoid conclusions based on small amounts of data

delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay)
  )
delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )

5.6.4 Useful Summary Functions

  • Measures of location: mean(x), median(x)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    avg_delay1 = mean(arr_delay),
    avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
  )
## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day avg_delay1 avg_delay2
##    <int> <int> <int>      <dbl>      <dbl>
##  1  2013     1     1     12.7         32.5
##  2  2013     1     2     12.7         32.0
##  3  2013     1     3      5.73        27.7
##  4  2013     1     4     -1.93        28.3
##  5  2013     1     5     -1.53        22.6
##  6  2013     1     6      4.24        24.4
##  7  2013     1     7     -4.95        27.8
##  8  2013     1     8     -3.23        20.8
##  9  2013     1     9     -0.264       25.6
## 10  2013     1    10     -5.90        27.3
## # … with 355 more rows
  • Measures of spread: sd(x), IQR(x), mad(x)
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(distance_sd = sd(distance)) %>% 
  arrange(desc(distance_sd))
## # A tibble: 104 x 2
##    dest  distance_sd
##    <chr>       <dbl>
##  1 EGE         10.5 
##  2 SAN         10.4 
##  3 SFO         10.2 
##  4 HNL         10.0 
##  5 SEA          9.98
##  6 LAS          9.91
##  7 PDX          9.87
##  8 PHX          9.86
##  9 LAX          9.66
## 10 IND          9.46
## # … with 94 more rows
  • Measures of rank: min(x), quantile(x, 0.25), max(x)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )
## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     1     1   517  2356
##  2  2013     1     2    42  2354
##  3  2013     1     3    32  2349
##  4  2013     1     4    25  2358
##  5  2013     1     5    14  2357
##  6  2013     1     6    16  2355
##  7  2013     1     7    49  2359
##  8  2013     1     8   454  2351
##  9  2013     1     9     2  2252
## 10  2013     1    10     3  2320
## # … with 355 more rows
  • Measures of position: first(x), nth(x,2), last(x)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time)
  )
## # A tibble: 365 x 5
## # Groups:   year, month [12]
##     year month   day first_dep last_dep
##    <int> <int> <int>     <int>    <int>
##  1  2013     1     1       517     2356
##  2  2013     1     2        42     2354
##  3  2013     1     3        32     2349
##  4  2013     1     4        25     2358
##  5  2013     1     5        14     2357
##  6  2013     1     6        16     2355
##  7  2013     1     7        49     2359
##  8  2013     1     8       454     2351
##  9  2013     1     9         2     2252
## 10  2013     1    10         3     2320
## # … with 355 more rows
# Complementary to measures of position
not_cancelled %>% 
  group_by(year, month, day) %>% 
  mutate(r = min_rank(desc(dep_time))) %>% 
  filter(r %in% range(r))
## # A tibble: 770 x 20
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1     2356           2359        -3      425            437
##  3  2013     1     2       42           2359        43      518            442
##  4  2013     1     2     2354           2359        -5      413            437
##  5  2013     1     3       32           2359        33      504            442
##  6  2013     1     3     2349           2359       -10      434            445
##  7  2013     1     4       25           2359        26      505            442
##  8  2013     1     4     2358           2359        -1      429            437
##  9  2013     1     4     2358           2359        -1      436            445
## 10  2013     1     5       14           2359        15      503            445
## # … with 760 more rows, and 12 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, r <int>
  • Counts: n(), sum(!is.na(x)), n_distinct(x)
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(carriers = n_distinct(carrier)) %>% 
  arrange(desc(carriers))
## # A tibble: 104 x 2
##    dest  carriers
##    <chr>    <int>
##  1 ATL          7
##  2 BOS          7
##  3 CLT          7
##  4 ORD          7
##  5 TPA          7
##  6 AUS          6
##  7 DCA          6
##  8 DTW          6
##  9 IAD          6
## 10 MSP          6
## # … with 94 more rows
not_cancelled %>% 
  count(dest)
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # … with 94 more rows
# Total number of miles a plane flew
not_cancelled %>% 
  count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # … with 4,027 more rows
# How many flights left before 5am? (these usually indicate delayed
# flights from the previous day)
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(n_early = sum(dep_time < 500))
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day n_early
##    <int> <int> <int>   <int>
##  1  2013     1     1       0
##  2  2013     1     2       3
##  3  2013     1     3       4
##  4  2013     1     4       3
##  5  2013     1     5       3
##  6  2013     1     6       2
##  7  2013     1     7       2
##  8  2013     1     8       1
##  9  2013     1     9       3
## 10  2013     1    10       3
## # … with 355 more rows
# What proportion of flights are delayed by more than an hour?
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(hour_prop = mean(arr_delay > 60))
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day hour_prop
##    <int> <int> <int>     <dbl>
##  1  2013     1     1    0.0722
##  2  2013     1     2    0.0851
##  3  2013     1     3    0.0567
##  4  2013     1     4    0.0396
##  5  2013     1     5    0.0349
##  6  2013     1     6    0.0470
##  7  2013     1     7    0.0333
##  8  2013     1     8    0.0213
##  9  2013     1     9    0.0202
## 10  2013     1    10    0.0183
## # … with 355 more rows

5.6.5 Grouping by Multiple Variables

daily <- group_by(flights, year, month, day)
(per_day   <- summarise(daily, flights = n()))
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day flights
##    <int> <int> <int>   <int>
##  1  2013     1     1     842
##  2  2013     1     2     943
##  3  2013     1     3     914
##  4  2013     1     4     915
##  5  2013     1     5     720
##  6  2013     1     6     832
##  7  2013     1     7     933
##  8  2013     1     8     899
##  9  2013     1     9     902
## 10  2013     1    10     932
## # … with 355 more rows
(per_month <- summarise(per_day, flights = sum(flights)))
## # A tibble: 12 x 3
## # Groups:   year [1]
##     year month flights
##    <int> <int>   <int>
##  1  2013     1   27004
##  2  2013     2   24951
##  3  2013     3   28834
##  4  2013     4   28330
##  5  2013     5   28796
##  6  2013     6   28243
##  7  2013     7   29425
##  8  2013     8   29327
##  9  2013     9   27574
## 10  2013    10   28889
## 11  2013    11   27268
## 12  2013    12   28135
(per_year  <- summarise(per_month, flights = sum(flights)))
## # A tibble: 1 x 2
##    year flights
##   <int>   <int>
## 1  2013  336776

5.6.6 Ungrouping

daily %>% 
  ungroup() %>%             # no longer grouped by date
  summarise(flights = n())  # all flights
## # A tibble: 1 x 1
##   flights
##     <int>
## 1  336776

5.6.7 Exercises

#1 Brainstorm at least 5 different ways to assess the typical dealy characteristics of a group of flights.
delay_char <-
  flights %>%
  group_by(flight) %>%
  summarise(n = n(),
            fif_early = mean(arr_delay == -15, na.rm = TRUE),
            fif_late = mean(arr_delay == 15, na.rm = TRUE),
            ten_always = mean(arr_delay == 10, na.rm = TRUE),
            thirty_early = mean(arr_delay == -30, na.rm = TRUE),
            thirty_late = mean(arr_delay == 30, na.rm = TRUE),
            per_on_time = mean(arr_delay == 0, na.rm = TRUE),
            two_hours = mean(arr_delay > 120, na.rm = TRUE))

#A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.

delay_char %>%
  filter(fif_early == 0.5, fif_late == 0.5)
## # A tibble: 0 x 9
## # … with 9 variables: flight <int>, n <int>, fif_early <dbl>, fif_late <dbl>,
## #   ten_always <dbl>, thirty_early <dbl>, thirty_late <dbl>, per_on_time <dbl>,
## #   two_hours <dbl>
#A flight is always 10 minutes late.

delay_char %>%
  filter(ten_always == 1)
## # A tibble: 5 x 9
##   flight     n fif_early fif_late ten_always thirty_early thirty_late
##    <int> <int>     <dbl>    <dbl>      <dbl>        <dbl>       <dbl>
## 1   2254     1         0        0          1            0           0
## 2   3656     1         0        0          1            0           0
## 3   3785     2         0        0          1            0           0
## 4   3880     1         0        0          1            0           0
## 5   5854     1         0        0          1            0           0
## # … with 2 more variables: per_on_time <dbl>, two_hours <dbl>
#A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.

delay_char %>%
  filter(thirty_early == 0.5 & thirty_late == 0.5)
## # A tibble: 0 x 9
## # … with 9 variables: flight <int>, n <int>, fif_early <dbl>, fif_late <dbl>,
## #   ten_always <dbl>, thirty_early <dbl>, thirty_late <dbl>, per_on_time <dbl>,
## #   two_hours <dbl>
#99% of the time a flight is on time. 1% of the time it’s 2 hours late.

delay_char %>%
  filter(per_on_time == 0.99 & two_hours == 0.01)
## # A tibble: 0 x 9
## # … with 9 variables: flight <int>, n <int>, fif_early <dbl>, fif_late <dbl>,
## #   ten_always <dbl>, thirty_early <dbl>, thirty_late <dbl>, per_on_time <dbl>,
## #   two_hours <dbl>
# Which is more important: arrival delay or departure delay?
# Arrival delay is mort important because it can affect the following flights/connector flights. 
#2 Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()).

not_cancelled %>% count(dest)
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # … with 94 more rows
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = length(dest))
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # … with 94 more rows
#3 Our defintion of cancelled flights (is.na(dep_delay)) | is.na(arr_delay)) is slightly suboptimal. Why? Which is the most important column?

#The most important column is the arr_delay. The definition is suboptimal because if a flight never left, then it was cancelled.
#4 Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?
flights%>% 
  group_by(day) %>%
  summarise(cancelled = mean(is.na(dep_delay)))
## # A tibble: 31 x 2
##      day cancelled
##    <int>     <dbl>
##  1     1   0.0223 
##  2     2   0.0231 
##  3     3   0.00972
##  4     4   0.00741
##  5     5   0.0208 
##  6     6   0.0268 
##  7     7   0.0289 
##  8     8   0.0817 
##  9     9   0.0546 
## 10    10   0.0477 
## # … with 21 more rows
# I'm not sure what the pattern is.
#5 Which carrier has the worst delays? Challenge: Can you disentangle the effects of bad airports vs. bad carriers? Why/why not?
flights %>%
  group_by(carrier) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(arr_delay))
## # A tibble: 16 x 2
##    carrier arr_delay
##    <chr>       <dbl>
##  1 F9         21.9  
##  2 FL         20.1  
##  3 EV         15.8  
##  4 YV         15.6  
##  5 OO         11.9  
##  6 MQ         10.8  
##  7 WN          9.65 
##  8 B6          9.46 
##  9 9E          7.38 
## 10 UA          3.56 
## 11 US          2.13 
## 12 VX          1.76 
## 13 DL          1.64 
## 14 AA          0.364
## 15 HA         -6.92 
## 16 AS         -9.93
# I'm confused by the challenge question. F9 is the worst carrier. Would you look at the airports that it goes to?
#6 What does the sort argument to count() do. When might you use it?

#Sorts the results in the order of 'n'; Sorts based on the count

5.7 Grouped Mutates (and Filters)

flights_sml %>% 
  group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)
## # A tibble: 3,306 x 7
## # Groups:   year, month, day [365]
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1       853       851      184       41
##  2  2013     1     1       290       338     1134      213
##  3  2013     1     1       260       263      266       46
##  4  2013     1     1       157       174      213       60
##  5  2013     1     1       216       222      708      121
##  6  2013     1     1       255       250      589      115
##  7  2013     1     1       285       246     1085      146
##  8  2013     1     1       192       191      199       44
##  9  2013     1     1       379       456     1092      222
## 10  2013     1     2       224       207      550       94
## # … with 3,296 more rows
popular_dests <- flights %>% 
  group_by(dest) %>% 
  filter(n() > 365)
popular_dests
## # A tibble: 332,577 x 19
## # Groups:   dest [77]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 332,567 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
popular_dests %>% 
  filter(arr_delay > 0) %>% 
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>% 
  select(year:day, dest, arr_delay, prop_delay)
## # A tibble: 131,106 x 6
## # Groups:   dest [77]
##     year month   day dest  arr_delay prop_delay
##    <int> <int> <int> <chr>     <dbl>      <dbl>
##  1  2013     1     1 IAH          11  0.000111 
##  2  2013     1     1 IAH          20  0.000201 
##  3  2013     1     1 MIA          33  0.000235 
##  4  2013     1     1 ORD          12  0.0000424
##  5  2013     1     1 FLL          19  0.0000938
##  6  2013     1     1 ORD           8  0.0000283
##  7  2013     1     1 LAX           7  0.0000344
##  8  2013     1     1 DFW          31  0.000282 
##  9  2013     1     1 ATL          12  0.0000400
## 10  2013     1     1 DTW          16  0.000116 
## # … with 131,096 more rows

5.7.1 Exercises

#1 Refer back to the lists of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping.

# See #### 5.5.1 Useful Creation Functions for notes
#2 Which plane(tailnum) has the worst on-time record?

flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(prop_time = sum(arr_delay <= 30)/n(),
            mean_arr = mean(arr_delay, na.rm = T),
            fl = n()) %>%
  arrange(desc(prop_time))
## # A tibble: 4,037 x 4
##    tailnum prop_time mean_arr    fl
##    <chr>       <dbl>    <dbl> <int>
##  1 N103US          1    -6.93    46
##  2 N1200K          1    -9.38    21
##  3 N121DE          1    15        2
##  4 N137DL          1    -5        1
##  5 N143DA          1    24        1
##  6 N14628          1    -6        1
##  7 N14629          1   -16.2      4
##  8 N1607B          1   -16        3
##  9 N1608           1   -11.3      3
## 10 N1610D          1   -14.5      2
## # … with 4,027 more rows
#3 What time of day should you fly if you want to avoid delays as much as possible?

# Grouped by hour to get the time of day
flights %>%
  group_by(hour) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(arr_delay)
## # A tibble: 20 x 2
##     hour arr_delay
##    <dbl>     <dbl>
##  1     7    -5.30 
##  2     5    -4.80 
##  3     6    -3.38 
##  4     9    -1.45 
##  5     8    -1.11 
##  6    10     0.954
##  7    11     1.48 
##  8    12     3.49 
##  9    13     6.54 
## 10    14     9.20 
## 11    23    11.8  
## 12    15    12.3  
## 13    16    12.6  
## 14    18    14.8  
## 15    22    16.0  
## 16    17    16.0  
## 17    19    16.7  
## 18    20    16.7  
## 19    21    18.4  
## 20     1   NaN
# Morning flights have less delays
#4 For each destination, compute the total minutes of delay. For each flight, compute the proportion of the delay for its destination.
flights %>%
  group_by(dest) %>%
  filter(!is.na(dep_delay)) %>%
  summarise(total=sum(dep_delay))
## # A tibble: 104 x 2
##    dest   total
##    <chr>  <dbl>
##  1 ABQ     3490
##  2 ACK     1711
##  3 ALB     9897
##  4 ANC      103
##  5 ATL   211391
##  6 AUS    31496
##  7 AVL     2154
##  8 BDL     7301
##  9 BGR     7011
## 10 BHM     8077
## # … with 94 more rows
flights%>%
  group_by(dest, tailnum)%>%
  filter(!is.na(dep_delay)) %>%
  summarise(avg=mean(dep_delay))
## # A tibble: 44,218 x 3
## # Groups:   dest [104]
##    dest  tailnum   avg
##    <chr> <chr>   <dbl>
##  1 ABQ   N503JB   10  
##  2 ABQ   N504JB    9.5
##  3 ABQ   N505JB    0  
##  4 ABQ   N506JB   15  
##  5 ABQ   N507JB    3  
##  6 ABQ   N508JB   -3.5
##  7 ABQ   N509JB    4.5
##  8 ABQ   N510JB    7.5
##  9 ABQ   N517JB   -2  
## 10 ABQ   N519JB   53  
## # … with 44,208 more rows
#5 Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag(), explore how the delay of a flight is related to the delay of the immediately preceding flight.
flights%>%
  group_by(origin)%>%
  arrange(origin,month,day,dep_time)%>%
  mutate(dep_delay_lag = lag(dep_delay)) %>%
  filter(!is.na(dep_delay), !is.na(dep_delay_lag))
## # A tibble: 327,649 x 20
## # Groups:   origin [3]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      554            558        -4      740            728
##  2  2013     1     1      555            600        -5      913            854
##  3  2013     1     1      558            600        -2      923            937
##  4  2013     1     1      559            600        -1      854            902
##  5  2013     1     1      601            600         1      844            850
##  6  2013     1     1      606            610        -4      858            910
##  7  2013     1     1      607            607         0      858            915
##  8  2013     1     1      608            600         8      807            735
##  9  2013     1     1      615            615         0      833            842
## 10  2013     1     1      622            630        -8     1017           1014
## # … with 327,639 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   dep_delay_lag <dbl>
#6 Look at each destination. Can you find flights that are suspiciously fast? Compute the air time of a flight relative to the shortest flight to that destination. Which flights were most delayed in the air?

flights %>%
  group_by(dest) %>%
  mutate(shortest = air_time - min(air_time, na.rm = TRUE)) %>%
  top_n(1, air_time) %>%
  arrange(desc(air_time)) %>%
  select(tailnum, sched_dep_time, sched_arr_time, shortest)
## Warning in min(air_time, na.rm = TRUE): no non-missing arguments to min;
## returning Inf
## Adding missing grouping variables: `dest`
## # A tibble: 112 x 5
## # Groups:   dest [104]
##    dest  tailnum sched_dep_time sched_arr_time shortest
##    <chr> <chr>            <int>          <int>    <dbl>
##  1 HNL   N77066            1335           1836      133
##  2 SFO   N703TW            1730           2110      195
##  3 LAX   N178DN            1815           2146      165
##  4 ANC   N572UA            1615           1953       46
##  5 SAN   N794JB            1620           1934      134
##  6 SNA   N16709            1819           2137      131
##  7 BUR   N624JB            1730           2046      110
##  8 LAS   N852UA            1729           2013      143
##  9 SJC   N632JB            1830           2205       91
## 10 SEA   N17245            1727           2040      119
## # … with 102 more rows
#7 Find all the destinations that are flown by at least two carriers. Use that information to rank the carriers.

flights %>%
  group_by(dest) %>%
  filter(n_distinct(carrier) >= 2) %>%
  group_by(carrier) %>%
  summarise(n = n_distinct(dest)) %>%
  arrange(desc(n))
## # A tibble: 16 x 2
##    carrier     n
##    <chr>   <int>
##  1 EV         51
##  2 9E         48
##  3 UA         42
##  4 DL         39
##  5 B6         35
##  6 AA         19
##  7 MQ         19
##  8 WN         10
##  9 OO          5
## 10 US          5
## 11 VX          4
## 12 YV          3
## 13 FL          2
## 14 AS          1
## 15 F9          1
## 16 HA          1
#8 For each plan, count the number of flights before the first delay of greater than 1 hour.

flights %>%
   filter(!is.na(dep_delay)) %>%
   group_by(tailnum) %>%
   mutate(total = sum(dep_delay > 60))
## # A tibble: 328,521 x 20
## # Groups:   tailnum [4,037]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 328,511 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   total <int>