Datasets can be transformed with dplyr, which can be loaded with the tidyverse package or by itself.

Filtering


Data can be subsetted with filter(). The following examples use the flights dataset. The code makes use of the pipe symbol, %>%. This can be thought of as a replacement for “then”.

> library(nycflights13) #contains flights dataset
> library(tidyverse) #contains dplyr
> names(flights) #column names
 [1] "year"           "month"          "day"            "dep_time"      
 [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
 [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
[13] "origin"         "dest"           "air_time"       "distance"      
[17] "hour"           "minute"         "time_hour"     
> #flights dataset THEN filter for Jan 1st
> #wrapping the whole statement in parentheses will 
> #display it as well as save it as Jan1
> #otherwise it will only save
> (Jan1 <-flights %>% filter(month==1,day==1))
# A tibble: 842 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ... with 832 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Comparisons

The typical comparison operators can be used, >,>=,<,<=,!=(not equal),==(equal).

> #Days 7 and higher
> (Jan7plus <-flights %>% filter(month==1,day>=7))
# A tibble: 21,838 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     7       49           2359        50      531            444
 2  2013     1     7      454            500        -6      637            648
 3  2013     1     7      523            525        -2      758            820
 4  2013     1     7      531            540        -9      827            850
 5  2013     1     7      536            540        -4     1020           1017
 6  2013     1     7      544            530        14      822            829
 7  2013     1     7      545            600       -15      646            709
 8  2013     1     7      552            600        -8      843            904
 9  2013     1     7      554            600        -6      708            715
10  2013     1     7      555            600        -5      741            801
# ... with 21,828 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Logical Operators

You can also use the logical operators & for "and", | for "or" and ! for "not".

> #November or December
> flights %>% filter(month==11 | month==12)
# A tibble: 55,403 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013    11     1        5           2359         6      352            345
 2  2013    11     1       35           2250       105      123           2356
 3  2013    11     1      455            500        -5      641            651
 4  2013    11     1      539            545        -6      856            827
 5  2013    11     1      542            545        -3      831            855
 6  2013    11     1      549            600       -11      912            923
 7  2013    11     1      550            600       -10      705            659
 8  2013    11     1      554            600        -6      659            701
 9  2013    11     1      554            600        -6      826            827
10  2013    11     1      554            600        -6      749            751
# ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

We can achieve the same results with this alternate method:

> # x %in% y format
> #selects every row where x is equal to a value in y
> (nov_dec <-flights %>% filter(month %in% c(11,12)))
# A tibble: 55,403 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013    11     1        5           2359         6      352            345
 2  2013    11     1       35           2250       105      123           2356
 3  2013    11     1      455            500        -5      641            651
 4  2013    11     1      539            545        -6      856            827
 5  2013    11     1      542            545        -3      831            855
 6  2013    11     1      549            600       -11      912            923
 7  2013    11     1      550            600       -10      705            659
 8  2013    11     1      554            600        -6      659            701
 9  2013    11     1      554            600        -6      826            827
10  2013    11     1      554            600        -6      749            751
# ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

De Morgan’s law:

  • !(x & y) = !x | !y
  • !(x | y) = !x & !y

To find flights that weren’t delayed by more than 2 hours either of the following could be used:

> flights %>% filter(!(arr_delay>120 | dep_delay>120 ))
> flights %>% filter(arr_delay<=120 & dep_delay<=120 )

Missing Values

filter() will exlcude NA values.

> #filter excludes false and NA values
> samp<- tibble(x=c(5,NA,8))
> samp %>% filter(x>5)
# A tibble: 1 x 1
      x
  <dbl>
1     8
> #must ask for NA values if they are wanted
> samp %>% filter(is.na(x)|x>5)
# A tibble: 2 x 1
      x
  <dbl>
1    NA
2     8

Filtering - Exercises

  1. Find all flights that:
  1. Had an arrival delay of two or more hours:
> library(tidyverse)
> library(nycflights13)
> 
> #arr_delay in minutes
> flights %>% filter(arr_delay>=120) %>% 
+   select(year, month, day, flight, arr_delay) %>% 
+   arrange(desc(arr_delay))
# A tibble: 10,200 x 5
    year month   day flight arr_delay
   <int> <int> <int>  <int>     <dbl>
 1  2013     1     9     51      1272
 2  2013     6    15   3535      1127
 3  2013     1    10   3695      1109
 4  2013     9    20    177      1007
 5  2013     7    22   3075       989
 6  2013     4    10   2391       931
 7  2013     3    17   2119       915
 8  2013     7    22   2047       895
 9  2013    12     5    172       878
10  2013     5     3   3744       875
# ... with 10,190 more rows
  1. Flew to Houston (IAH or HOU):
> flights %>% filter(dest %in% c("IAH","HOU")) %>% 
+   select(year, month, day, flight, dest) 
# A tibble: 9,313 x 5
    year month   day flight dest 
   <int> <int> <int>  <int> <chr>
 1  2013     1     1   1545 IAH  
 2  2013     1     1   1714 IAH  
 3  2013     1     1    496 IAH  
 4  2013     1     1    473 IAH  
 5  2013     1     1   1479 IAH  
 6  2013     1     1   1220 IAH  
 7  2013     1     1   1004 IAH  
 8  2013     1     1    455 IAH  
 9  2013     1     1   1086 IAH  
10  2013     1     1   1461 IAH  
# ... with 9,303 more rows
  1. Were operated by United, American, or Delta:
> flights %>% filter(carrier %in% c("UA","AA","DL")) %>% 
+   select(year, month, day, flight, carrier) 
# A tibble: 139,504 x 5
    year month   day flight carrier
   <int> <int> <int>  <int> <chr>  
 1  2013     1     1   1545 UA     
 2  2013     1     1   1714 UA     
 3  2013     1     1   1141 AA     
 4  2013     1     1    461 DL     
 5  2013     1     1   1696 UA     
 6  2013     1     1    301 AA     
 7  2013     1     1    194 UA     
 8  2013     1     1   1124 UA     
 9  2013     1     1    707 AA     
10  2013     1     1   1187 UA     
# ... with 139,494 more rows
  1. Departed in summer (July, August, and September):
> flights %>% filter(month %in% 7:9) %>% 
+   select(year, month, day, flight) 
# A tibble: 86,326 x 4
    year month   day flight
   <int> <int> <int>  <int>
 1  2013     7     1    915
 2  2013     7     1   1503
 3  2013     7     1    234
 4  2013     7     1   1371
 5  2013     7     1    185
 6  2013     7     1    165
 7  2013     7     1    415
 8  2013     7     1    425
 9  2013     7     1   1183
10  2013     7     1    623
# ... with 86,316 more rows
  1. Arrived more that two hours late, but didn’t leave late:
> flights %>% filter(arr_delay>120 & dep_delay<=0) %>% 
+   select(year, month, day, dep_delay, arr_delay) %>% 
+   arrange(desc(dep_delay))
# A tibble: 29 x 5
    year month   day dep_delay arr_delay
   <int> <int> <int>     <dbl>     <dbl>
 1  2013    10     7         0       130
 2  2013     5    23         0       128
 3  2013     7     1         0       140
 4  2013     1    27        -1       124
 5  2013     7     7        -1       147
 6  2013     7    28        -1       129
 7  2013    10     7        -2       124
 8  2013    11     1        -2       194
 9  2013     4    18        -2       179
10  2013     6    14        -2       132
# ... with 19 more rows
  1. Were delayed by at least an hour, but made up over 30 minutes in flight:
> flights %>% filter(dep_delay>=60 & arr_delay<(dep_delay-30)) %>% 
+   select(year, month, day, dep_delay, arr_delay)
# A tibble: 1,844 x 5
    year month   day dep_delay arr_delay
   <int> <int> <int>     <dbl>     <dbl>
 1  2013     1     1       285       246
 2  2013     1     1       116        73
 3  2013     1     3       162       128
 4  2013     1     3        99        66
 5  2013     1     3        65        28
 6  2013     1     3       102        67
 7  2013     1     3        65         1
 8  2013     1     3        60        24
 9  2013     1     3       177       141
10  2013     1     4       137       105
# ... with 1,834 more rows
  1. Departed between midnight and 6 a.m. (inclusive):
> #midnight is recorded as 2400.  12:01 AM is recorded as 1.  
> #6:00 AM is recorded as 600. <=600 will capture 12:01 AM to 6:00 AM
> flights %>% filter(dep_time==2400 | dep_time<=600) %>% 
+   select(year, month, day, flight, dep_time)
# A tibble: 9,373 x 5
    year month   day flight dep_time
   <int> <int> <int>  <int>    <int>
 1  2013     1     1   1545      517
 2  2013     1     1   1714      533
 3  2013     1     1   1141      542
 4  2013     1     1    725      544
 5  2013     1     1    461      554
 6  2013     1     1   1696      554
 7  2013     1     1    507      555
 8  2013     1     1   5708      557
 9  2013     1     1     79      557
10  2013     1     1    301      558
# ... with 9,363 more rows
  1. Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?
  • between() is a shortcut for x >= left & x <= right
> flights %>% filter(between(month,7,9)) %>% 
+   select(year, month, day, flight) 
# A tibble: 86,326 x 4
    year month   day flight
   <int> <int> <int>  <int>
 1  2013     7     1    915
 2  2013     7     1   1503
 3  2013     7     1    234
 4  2013     7     1   1371
 5  2013     7     1    185
 6  2013     7     1    165
 7  2013     7     1    415
 8  2013     7     1    425
 9  2013     7     1   1183
10  2013     7     1    623
# ... with 86,316 more rows
  1. How many flights have a missing dep_time? What other variables are missing? What might these rows represent?
  • arr_time and the delay variables are missing
  • These represent canceled flights
> flights %>% filter(is.na(dep_time)) 
# A tibble: 8,255 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1       NA           1630        NA       NA           1815
 2  2013     1     1       NA           1935        NA       NA           2240
 3  2013     1     1       NA           1500        NA       NA           1825
 4  2013     1     1       NA            600        NA       NA            901
 5  2013     1     2       NA           1540        NA       NA           1747
 6  2013     1     2       NA           1620        NA       NA           1746
 7  2013     1     2       NA           1355        NA       NA           1459
 8  2013     1     2       NA           1420        NA       NA           1644
 9  2013     1     2       NA           1321        NA       NA           1536
10  2013     1     2       NA           1545        NA       NA           1910
# ... with 8,245 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)
  • General rule - if the NA doesn’t matter to the outcome then the result will not be an NA. If it does matter the answer will be NA.
> #anything raised to 0 equals 1. NA doesn't matter.
> NA^0
[1] 1
> #anything or true is true. NA doesn't matter.
> NA | TRUE
[1] TRUE
> #anything & false is false.  NA doesn't matter.
> NA & FALSE
[1] FALSE
> #NA could represent infinity or a number.  NA matters.
> NA*0
[1] NA
> Inf * 0
[1] NaN

Arranging Rows


arrange() can be used to to change the order of rows.

> # order by year, then month, then day
> flights %>%  arrange(year, month, day)
# A tibble: 336,776 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

desc can be used for descending order

> #by dep_delay descending
> flights%>% arrange(desc(dep_delay))
# A tibble: 336,776 x 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     9      641            900      1301     1242           1530
 2  2013     6    15     1432           1935      1137     1607           2120
 3  2013     1    10     1121           1635      1126     1239           1810
 4  2013     9    20     1139           1845      1014     1457           2210
 5  2013     7    22      845           1600      1005     1044           1815
 6  2013     4    10     1100           1900       960     1342           2211
 7  2013     3    17     2321            810       911      135           1020
 8  2013     6    27      959           1900       899     1236           2226
 9  2013     7    22     2257            759       898      121           1026
10  2013    12     5      756           1700       896     1058           2020
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Missing values are always sorted at the end unless you ask for them to be moved up.

> #missing values always sorted at the end
> samp<-tibble(x=c(9,4,NA))
> samp %>% arrange(x)
# A tibble: 3 x 1
      x
  <dbl>
1     4
2     9
3    NA
> samp %>% arrange(desc(x))
# A tibble: 3 x 1
      x
  <dbl>
1     9
2     4
3    NA
> #Can use !is.na to move NA values to the top
> samp %>% arrange(!is.na(x),x)
# A tibble: 3 x 1
      x
  <dbl>
1    NA
2     4
3     9

Arranging Rows - Exercises

  1. How could you use arrange() to sort all missing values to the start?
> samp<-tibble(x=c(9,4,NA))
> samp %>% arrange(!is.na(x),x)
# A tibble: 3 x 1
      x
  <dbl>
1    NA
2     4
3     9
  1. Sort flights to find the most delayed flights. Find the flights that left earliest.
> #most delayed
> flights %>% arrange(desc(dep_delay)) %>% 
+   select(year, month, day, flight, dep_delay)
# A tibble: 336,776 x 5
    year month   day flight dep_delay
   <int> <int> <int>  <int>     <dbl>
 1  2013     1     9     51      1301
 2  2013     6    15   3535      1137
 3  2013     1    10   3695      1126
 4  2013     9    20    177      1014
 5  2013     7    22   3075      1005
 6  2013     4    10   2391       960
 7  2013     3    17   2119       911
 8  2013     6    27   2007       899
 9  2013     7    22   2047       898
10  2013    12     5    172       896
# ... with 336,766 more rows
> #left earliest
> flights %>% arrange(dep_time) %>% 
+   select(year, month, day, flight, dep_time)
# A tibble: 336,776 x 5
    year month   day flight dep_time
   <int> <int> <int>  <int>    <int>
 1  2013     1    13     22        1
 2  2013     1    31    530        1
 3  2013    11    13   1503        1
 4  2013    12    16    839        1
 5  2013    12    20   1503        1
 6  2013    12    26   1503        1
 7  2013    12    30    839        1
 8  2013     2    11    530        1
 9  2013     2    24    608        1
10  2013     3     8    739        1
# ... with 336,766 more rows
  1. Sort flights to find the fastest flights.
> flights %>% mutate(speed=distance/air_time*60) %>% 
+   arrange(desc(speed)) %>% 
+   select(year, month, day, flight, speed)
# A tibble: 336,776 x 5
    year month   day flight speed
   <int> <int> <int>  <int> <dbl>
 1  2013     5    25   1499  703.
 2  2013     7     2   4667  650.
 3  2013     5    13   4292  648 
 4  2013     3    23   3805  641.
 5  2013     1    12   1902  591.
 6  2013    11    17    315  564 
 7  2013     2    21    707  557.
 8  2013    11    17    936  556.
 9  2013    11    16    347  554.
10  2013    11    16   1503  554.
# ... with 336,766 more rows
  1. Which flights traveled the longest? Which traveled the shortest?
> #longest
> flights %>% arrange(desc(distance)) %>% 
+   select(year, month, day, flight, distance, origin, dest)
# A tibble: 336,776 x 7
    year month   day flight distance origin dest 
   <int> <int> <int>  <int>    <dbl> <chr>  <chr>
 1  2013     1     1     51     4983 JFK    HNL  
 2  2013     1     2     51     4983 JFK    HNL  
 3  2013     1     3     51     4983 JFK    HNL  
 4  2013     1     4     51     4983 JFK    HNL  
 5  2013     1     5     51     4983 JFK    HNL  
 6  2013     1     6     51     4983 JFK    HNL  
 7  2013     1     7     51     4983 JFK    HNL  
 8  2013     1     8     51     4983 JFK    HNL  
 9  2013     1     9     51     4983 JFK    HNL  
10  2013     1    10     51     4983 JFK    HNL  
# ... with 336,766 more rows
> #shortest
> flights %>% arrange(distance) %>% 
+   select(year, month, day, flight, distance, origin, dest)
# A tibble: 336,776 x 7
    year month   day flight distance origin dest 
   <int> <int> <int>  <int>    <dbl> <chr>  <chr>
 1  2013     7    27   1632       17 EWR    LGA  
 2  2013     1     3   3833       80 EWR    PHL  
 3  2013     1     4   4193       80 EWR    PHL  
 4  2013     1     4   4502       80 EWR    PHL  
 5  2013     1     4   4645       80 EWR    PHL  
 6  2013     1     5   4193       80 EWR    PHL  
 7  2013     1     6   4619       80 EWR    PHL  
 8  2013     1     7   4619       80 EWR    PHL  
 9  2013     1     8   4619       80 EWR    PHL  
10  2013     1     9   4619       80 EWR    PHL  
# ... with 336,766 more rows

Selecting Columns


select() allows you to choose columns.

> #Select year, month, day colummns
> flights %>% select(year,month,day)
# A tibble: 336,776 x 3
    year month   day
   <int> <int> <int>
 1  2013     1     1
 2  2013     1     1
 3  2013     1     1
 4  2013     1     1
 5  2013     1     1
 6  2013     1     1
 7  2013     1     1
 8  2013     1     1
 9  2013     1     1
10  2013     1     1
# ... with 336,766 more rows
> #Select all columns between year to day (inclusive)
> flights %>% select(year:day)
# A tibble: 336,776 x 3
    year month   day
   <int> <int> <int>
 1  2013     1     1
 2  2013     1     1
 3  2013     1     1
 4  2013     1     1
 5  2013     1     1
 6  2013     1     1
 7  2013     1     1
 8  2013     1     1
 9  2013     1     1
10  2013     1     1
# ... with 336,766 more rows
> #Select all columns except year to day (inclusive)
> flights %>% select(-(year:day))
# A tibble: 336,776 x 16
   dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
      <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
 1      517            515         2      830            819        11 UA     
 2      533            529         4      850            830        20 UA     
 3      542            540         2      923            850        33 AA     
 4      544            545        -1     1004           1022       -18 B6     
 5      554            600        -6      812            837       -25 DL     
 6      554            558        -4      740            728        12 UA     
 7      555            600        -5      913            854        19 B6     
 8      557            600        -3      709            723       -14 EV     
 9      557            600        -3      838            846        -8 B6     
10      558            600        -2      753            745         8 AA     
# ... with 336,766 more rows, and 9 more variables: flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Helper Functions

select() can also be used with helper functions such as

  • starts_with(“x”) - begins with x
  • ends_with(“x”) - ends with x
  • contains(“x”) - contains x
  • matches() - matches a regular expression
  • num_range(“x”,1:3) - matches x1, x2, x3
  • everything() - every column not already selected
  • last_col() - last variable, possibly with an offset
  • all_of() - matches variable names in a character vector
  • any_of() - same as all_of but no error if one is missing
  • where() - applies a function to variables and includes where TRUE

The entire list can be viewed with ?select.

Some examples using the iris dataset are provided below.

> # convert iris datset to tible for viewing
> iris <- as_tibble(iris)
> #select columns that don't end in Width
> iris %>% select(!ends_with("Width"))
# A tibble: 150 x 3
   Sepal.Length Petal.Length Species
          <dbl>        <dbl> <fct>  
 1          5.1          1.4 setosa 
 2          4.9          1.4 setosa 
 3          4.7          1.3 setosa 
 4          4.6          1.5 setosa 
 5          5            1.4 setosa 
 6          5.4          1.7 setosa 
 7          4.6          1.4 setosa 
 8          5            1.5 setosa 
 9          4.4          1.4 setosa 
10          4.9          1.5 setosa 
# ... with 140 more rows
> #starts with Petal and ends with Width
> iris %>% select(starts_with("Petal") & ends_with("Width"))
# A tibble: 150 x 1
   Petal.Width
         <dbl>
 1         0.2
 2         0.2
 3         0.2
 4         0.2
 5         0.2
 6         0.4
 7         0.3
 8         0.2
 9         0.2
10         0.1
# ... with 140 more rows
> #starts with Petal or ends with Width
> iris %>% select(starts_with("Petal") | ends_with("Width"))
# A tibble: 150 x 3
   Petal.Length Petal.Width Sepal.Width
          <dbl>       <dbl>       <dbl>
 1          1.4         0.2         3.5
 2          1.4         0.2         3  
 3          1.3         0.2         3.2
 4          1.5         0.2         3.1
 5          1.4         0.2         3.6
 6          1.7         0.4         3.9
 7          1.4         0.3         3.4
 8          1.5         0.2         3.4
 9          1.4         0.2         2.9
10          1.5         0.1         3.1
# ... with 140 more rows
> #starts with Petal and does not end with Width
> iris %>% select(starts_with("Petal") & !ends_with("Width"))
# A tibble: 150 x 1
   Petal.Length
          <dbl>
 1          1.4
 2          1.4
 3          1.3
 4          1.5
 5          1.4
 6          1.7
 7          1.4
 8          1.5
 9          1.4
10          1.5
# ... with 140 more rows

If you want to move columns to the front everything() is useful.

> head(iris,1)
# A tibble: 1 x 5
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
         <dbl>       <dbl>        <dbl>       <dbl> <fct>  
1          5.1         3.5          1.4         0.2 setosa 
> #move Species to the front
> iris %>% select(Species, everything())
# A tibble: 150 x 5
   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
   <fct>          <dbl>       <dbl>        <dbl>       <dbl>
 1 setosa           5.1         3.5          1.4         0.2
 2 setosa           4.9         3            1.4         0.2
 3 setosa           4.7         3.2          1.3         0.2
 4 setosa           4.6         3.1          1.5         0.2
 5 setosa           5           3.6          1.4         0.2
 6 setosa           5.4         3.9          1.7         0.4
 7 setosa           4.6         3.4          1.4         0.3
 8 setosa           5           3.4          1.5         0.2
 9 setosa           4.4         2.9          1.4         0.2
10 setosa           4.9         3.1          1.5         0.1
# ... with 140 more rows
> #move Species to the front, keep everything but Sepal Length
> iris %>% select(Species, everything(),-Sepal.Length)
# A tibble: 150 x 4
   Species Sepal.Width Petal.Length Petal.Width
   <fct>         <dbl>        <dbl>       <dbl>
 1 setosa          3.5          1.4         0.2
 2 setosa          3            1.4         0.2
 3 setosa          3.2          1.3         0.2
 4 setosa          3.1          1.5         0.2
 5 setosa          3.6          1.4         0.2
 6 setosa          3.9          1.7         0.4
 7 setosa          3.4          1.4         0.3
 8 setosa          3.4          1.5         0.2
 9 setosa          2.9          1.4         0.2
10 setosa          3.1          1.5         0.1
# ... with 140 more rows

Selecting Columns - Exercises

  1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.
  • These are just some of many:
> flights %>% select(4,6,7,9)
> flights %>% select(dep_time, dep_delay, arr_time, arr_delay)
> flights %>% select(dep_time:arr_delay,-sched_dep_time,-sched_arr_time)
> flights %>% select(-(year:day),-(carrier:time_hour),
+                    -sched_dep_time,-sched_arr_time)
> flights %>% select(starts_with("dep")|starts_with("arr"))
> flights %>% select((ends_with("time")|ends_with("delay"))
+                    &!starts_with("sched")&!starts_with("air"))
> flights %>% select(matches("^(dep|arr)_(time|delay)$"))
> flights %>% select(all_of(c("dep_time", "dep_delay", 
+                             "arr_time", "arr_delay")))
# A tibble: 336,776 x 4
   dep_time dep_delay arr_time arr_delay
      <int>     <dbl>    <int>     <dbl>
 1      517         2      830        11
 2      533         4      850        20
 3      542         2      923        33
 4      544        -1     1004       -18
 5      554        -6      812       -25
 6      554        -4      740        12
 7      555        -5      913        19
 8      557        -3      709       -14
 9      557        -3      838        -8
10      558        -2      753         8
# ... with 336,766 more rows
  1. What happens if you include the name of a variable multiple times in a select() call?.
  • It does not duplicate the variable. This is helpful when moving a variable forward and using everything().
> flights %>% select(day,year,year,month,year,everything())
# A tibble: 336,776 x 19
     day  year month dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1     1  2013     1      517            515         2      830            819
 2     1  2013     1      533            529         4      850            830
 3     1  2013     1      542            540         2      923            850
 4     1  2013     1      544            545        -1     1004           1022
 5     1  2013     1      554            600        -6      812            837
 6     1  2013     1      554            558        -4      740            728
 7     1  2013     1      555            600        -5      913            854
 8     1  2013     1      557            600        -3      709            723
 9     1  2013     1      557            600        -3      838            846
10     1  2013     1      558            600        -2      753            745
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. What does all_of or any_of do? Why might it be helpful in conjunction with the following vector?.
  • They allow you to select variables contained in a character vector.
> vars <- c("year","month","day","dep_delay","arr_delay","made_up")
> 
> #any_of doesn't check for missing variables
> flights %>% select(any_of(vars))
# A tibble: 336,776 x 5
    year month   day dep_delay arr_delay
   <int> <int> <int>     <dbl>     <dbl>
 1  2013     1     1         2        11
 2  2013     1     1         4        20
 3  2013     1     1         2        33
 4  2013     1     1        -1       -18
 5  2013     1     1        -6       -25
 6  2013     1     1        -4        12
 7  2013     1     1        -5        19
 8  2013     1     1        -3       -14
 9  2013     1     1        -3        -8
10  2013     1     1        -2         8
# ... with 336,766 more rows
> #With all_of if any of the variables in the character vector is missing 
> #an error is produced.
> flights %>% select(all_of(vars))
  1. Does the result of running the following code surprise you? select(flights,contains("TIME")) How do the select helpers deal with case by default? How can you change the default?.
  • By default it ignores case. It can be changed with ignore.case=FALSE.
> select(flights,contains("TIME"))
# A tibble: 336,776 x 6
   dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
      <int>          <int>    <int>          <int>    <dbl> <dttm>             
 1      517            515      830            819      227 2013-01-01 05:00:00
 2      533            529      850            830      227 2013-01-01 05:00:00
 3      542            540      923            850      160 2013-01-01 05:00:00
 4      544            545     1004           1022      183 2013-01-01 05:00:00
 5      554            600      812            837      116 2013-01-01 06:00:00
 6      554            558      740            728      150 2013-01-01 05:00:00
 7      555            600      913            854      158 2013-01-01 06:00:00
 8      557            600      709            723       53 2013-01-01 06:00:00
 9      557            600      838            846      140 2013-01-01 06:00:00
10      558            600      753            745      138 2013-01-01 06:00:00
# ... with 336,766 more rows
> select(flights,contains("TIME", ignore.case = FALSE))
# A tibble: 336,776 x 0

Renaming


rename() can be used to rename columns

> flights %>% rename(DeptTime = dep_time) 
# A tibble: 336,776 x 19
    year month   day DeptTime sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Creating New Variables


You can add new columns with mutate(). The new variables are always placed at the end.

> #smaller dataset
> flights_sml <- select(flights, year:day,ends_with("delay"),
+                       distance,air_time)
> #add gain and speed
> flights_sml %>% mutate(gain=arr_delay-dep_delay,
+        speed=distance/air_time*60)
# A tibble: 336,776 x 9
    year month   day dep_delay arr_delay distance air_time  gain speed
   <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
 1  2013     1     1         2        11     1400      227     9  370.
 2  2013     1     1         4        20     1416      227    16  374.
 3  2013     1     1         2        33     1089      160    31  408.
 4  2013     1     1        -1       -18     1576      183   -17  517.
 5  2013     1     1        -6       -25      762      116   -19  394.
 6  2013     1     1        -4        12      719      150    16  288.
 7  2013     1     1        -5        19     1065      158    24  404.
 8  2013     1     1        -3       -14      229       53   -11  259.
 9  2013     1     1        -3        -8      944      140    -5  405.
10  2013     1     1        -2         8      733      138    10  319.
# ... with 336,766 more rows

You can also refer to columns that were just created.

> #can refer to columns just created
> flights_sml %>% mutate(gain=arr_delay-dep_delay,
+        hours=air_time/60, gain_per_hour=gain/hours)
# A tibble: 336,776 x 10
    year month   day dep_delay arr_delay distance air_time  gain hours
   <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
 1  2013     1     1         2        11     1400      227     9 3.78 
 2  2013     1     1         4        20     1416      227    16 3.78 
 3  2013     1     1         2        33     1089      160    31 2.67 
 4  2013     1     1        -1       -18     1576      183   -17 3.05 
 5  2013     1     1        -6       -25      762      116   -19 1.93 
 6  2013     1     1        -4        12      719      150    16 2.5  
 7  2013     1     1        -5        19     1065      158    24 2.63 
 8  2013     1     1        -3       -14      229       53   -11 0.883
 9  2013     1     1        -3        -8      944      140    -5 2.33 
10  2013     1     1        -2         8      733      138    10 2.3  
# ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>

transmute() will only keep the new variables.

> # only keep new variables use transmute
> flights %>% transmute(gain=arr_delay-dep_delay, hours=air_time/60,
+           gain_per_hour=gain/hours)
# A tibble: 336,776 x 3
    gain hours gain_per_hour
   <dbl> <dbl>         <dbl>
 1     9 3.78           2.38
 2    16 3.78           4.23
 3    31 2.67          11.6 
 4   -17 3.05          -5.57
 5   -19 1.93          -9.83
 6    16 2.5            6.4 
 7    24 2.63           9.11
 8   -11 0.883        -12.5 
 9    -5 2.33          -2.14
10    10 2.3            4.35
# ... with 336,766 more rows

Creation Functions

You can create new variables with the standard arithmetic operators, +,-,*,/,^, or with modular operators,%/%(integer division) and %%(remainder).

> flights %>% transmute(dep_time, hour=dep_time%/%100,
+           minute=dep_time%%100)
# A tibble: 336,776 x 3
   dep_time  hour minute
      <int> <dbl>  <dbl>
 1      517     5     17
 2      533     5     33
 3      542     5     42
 4      544     5     44
 5      554     5     54
 6      554     5     54
 7      555     5     55
 8      557     5     57
 9      557     5     57
10      558     5     58
# ... with 336,766 more rows

You can also used logs, such as log(),log2(),and log(10)

log2() is easy to interpret because a difference of 1 is equal to a doubling on the original scale.

> #log2 a difference of 1 is a doubling on original scale
> log2(50)
[1] 5.643856
> log2(100)
[1] 6.643856

Offsets like lead() and lag() could be useful in calculating changes.

> (x<- 1:10)
 [1]  1  2  3  4  5  6  7  8  9 10
> lag(x)
 [1] NA  1  2  3  4  5  6  7  8  9
> lead(x)
 [1]  2  3  4  5  6  7  8  9 10 NA

The following cumulative calculations are available:

> x
 [1]  1  2  3  4  5  6  7  8  9 10
> cumsum(x)
 [1]  1  3  6 10 15 21 28 36 45 55
> cummean(x)
 [1] 1.000000 1.000000 1.333333 1.750000 2.200000 2.666667 3.142857 3.625000
 [9] 4.111111 4.600000
> cummin(x)
 [1] 1 1 1 1 1 1 1 1 1 1
> cummax(x)
 [1]  1  2  3  4  5  6  7  8  9 10
> cumprod(x)
 [1]       1       2       6      24     120     720    5040   40320  362880
[10] 3628800

The package RcppRoll can be used for rolling aggregates.

Ranking

min_rank provides a ranking with the smallest values given to the smallest ranks. desc(x) gives the largest values to the smallest ranks.

> #ranking
> y <-c(2,3,3,NA,5,6)
> min_rank(y)
[1]  1  2  2 NA  4  5
> min_rank(desc(y))
[1]  5  3  3 NA  2  1
  • row_number(): equivalent to rank(ties.method = "first")
  • min_rank(): equivalent to rank(ties.method = "min")
  • dense_rank(): like min_rank(), but with no gaps between ranks
  • percent_rank(): a number between 0 and 1 computed by rescaling min_rank to [0, 1]
  • cume_dist(): a cumulative distribution function. Proportion of all values less than or equal to the current rank.
  • ntile(): a rough rank, which breaks the input vector into n buckets. The size of the buckets may differ by up to one, larger buckets have lower rank.
> x <- c(5, 1, 3, 2, 2, NA)
> row_number(x)
[1]  5  1  4  2  3 NA
> min_rank(x)
[1]  5  1  4  2  2 NA
> dense_rank(x)
[1]  4  1  3  2  2 NA
> percent_rank(x)
[1] 1.00 0.00 0.75 0.25 0.25   NA
> cume_dist(x)
[1] 1.0 0.2 0.8 0.6 0.6  NA
> ntile(x, 2)
[1]  2  1  2  1  1 NA
> ntile(1:8, 3)
[1] 1 1 1 2 2 2 3 3

Creating New Variables - Exercises

  1. Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.
  • Midnight is 2400. Exactly midnight should be represented as 0, not 1440.
  • This can be solved by using %%1440, the remainder when dividing by 1440.
> #issue with midnight being 2400
> flights %>% select(flight,dep_time,sched_dep_time) %>% 
+   mutate(dep_time_min=(dep_time%/%100)*60+dep_time%%100,
+          sched_dep_time_min=(sched_dep_time%/%100)*60+sched_dep_time%%100) %>% 
+   arrange(desc(dep_time_min))
# A tibble: 336,776 x 5
   flight dep_time sched_dep_time dep_time_min sched_dep_time_min
    <int>    <int>          <int>        <dbl>              <dbl>
 1    839     2400           2359         1440               1439
 2    745     2400           2359         1440               1439
 3   1503     2400           2359         1440               1439
 4   1503     2400           2359         1440               1439
 5   1816     2400           2250         1440               1370
 6   1503     2400           2359         1440               1439
 7   1503     2400           2359         1440               1439
 8   2379     2400           1700         1440               1020
 9    727     2400           2359         1440               1439
10    739     2400           2359         1440               1439
# ... with 336,766 more rows
> #longest time is 1 minute before midnight
> flights %>% select(flight,dep_time,sched_dep_time) %>% 
+   mutate(dep_time_min=(dep_time%/%100*60+dep_time%%100)%%1440,
+          sched_dep_time_min=(sched_dep_time%/%100*60+sched_dep_time%%100)%%1440) %>% 
+   arrange(desc(dep_time_min))
# A tibble: 336,776 x 5
   flight dep_time sched_dep_time dep_time_min sched_dep_time_min
    <int>    <int>          <int>        <dbl>              <dbl>
 1    727     2359           2359         1439               1439
 2    727     2359           2359         1439               1439
 3    701     2359           2130         1439               1290
 4    727     2359           2359         1439               1439
 5    739     2359           2359         1439               1439
 6    739     2359           2359         1439               1439
 7   5811     2359           2110         1439               1270
 8    839     2359           2359         1439               1439
 9   1503     2359           2359         1439               1439
10    839     2359           2359         1439               1439
# ... with 336,766 more rows
  1. Compare air_time with arr_time - dep_time. What do you see? What do you expect to see?.
  • air_time_diff is the difference between air_time and the calculation using arrival and departure times.

  • Problem 1 - crossing over midnight adds 1440 to air_time_diff.

  • Problem 2 - Timezone changes could add 60, 120, or 180 to air_time_diff.

  • Problem 3 - We can see from the plot that there are many differences other than 60, 120, 180, or 1440.

air_time does not include time spent on the runway, which results in Problem 3. Time on the runway would be included in arr_time and dep_time.

> flights_time <- flights %>% 
+   mutate(dep_time_min=(dep_time%/%100*60+dep_time%%100)%%1440,
+          arr_time_min=(arr_time%/%100*60+arr_time%%100)%%1440,
+          time_diff=arr_time_min-dep_time_min,
+          air_time_diff=air_time-time_diff) %>% 
+   select(dep_time_min,arr_time_min,time_diff,air_time,air_time_diff) %>% 
+   arrange(desc(air_time_diff))
> 
> flights_time
# A tibble: 336,776 x 5
   dep_time_min arr_time_min time_diff air_time air_time_diff
          <dbl>        <dbl>     <dbl>    <dbl>         <dbl>
 1         1075           34     -1041      648          1689
 2         1270            1     -1269      340          1609
 3         1302           32     -1270      336          1606
 4         1338           24     -1314      292          1606
 5         1305           23     -1282      324          1606
 6         1426          133     -1293      313          1606
 7         1376           64     -1312      294          1606
 8         1406           91     -1315      290          1605
 9         1358           81     -1277      328          1605
10         1300           31     -1269      336          1605
# ... with 336,766 more rows
> #plot of air_time_diff
> ggplot(flights_time, aes(x = air_time_diff)) +
+   geom_histogram(binwidth = 1, fill="firebrick") +theme_light()

  1. Compare dep-time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?
  • You would expect that the difference between sched_dep_time and dep_time would equal dep_delay.
  • However, sometimes the dep_time crosses over midnight adding 1440 to the calculation.
> FlightsDepTime<- flights %>% 
+   mutate(dep_time_min=(dep_time%/%100*60+dep_time%%100)%%1440,
+          sched_dep_time_min=(sched_dep_time%/%100*60+sched_dep_time%%100)%%1440,
+          dep_delay_diff=dep_delay-dep_time_min+sched_dep_time_min) %>% 
+   select(dep_time_min,sched_dep_time_min,dep_delay,dep_delay_diff)
> 
> FlightsDepTime
# A tibble: 336,776 x 4
   dep_time_min sched_dep_time_min dep_delay dep_delay_diff
          <dbl>              <dbl>     <dbl>          <dbl>
 1          317                315         2              0
 2          333                329         4              0
 3          342                340         2              0
 4          344                345        -1              0
 5          354                360        -6              0
 6          354                358        -4              0
 7          355                360        -5              0
 8          357                360        -3              0
 9          357                360        -3              0
10          358                360        -2              0
# ... with 336,766 more rows
> ggplot(filter(FlightsDepTime, dep_delay_diff > 0),
+   aes(y = sched_dep_time_min, x = dep_delay_diff)) +
+   geom_point(color="firebrick")+theme_light()

  1. Find the 10 most delayed flights using a ranking function. How do you want to handle ties?
  • min_rank(): equivalent to rank(ties.method = “min”)
> flights %>% mutate(dep_rank=min_rank(desc(dep_delay))) %>% 
+   select(flight,dep_delay,dep_rank) %>% arrange(desc(dep_delay))
# A tibble: 336,776 x 3
   flight dep_delay dep_rank
    <int>     <dbl>    <int>
 1     51      1301        1
 2   3535      1137        2
 3   3695      1126        3
 4    177      1014        4
 5   3075      1005        5
 6   2391       960        6
 7   2119       911        7
 8   2007       899        8
 9   2047       898        9
10    172       896       10
# ... with 336,766 more rows
  1. What does 1:3 + 1:10 return? Why?
  • It returns a warning because the longer object is not a multiple of the shorter object.
  • It recycles the shorter object, which will cut off because of the length difference.
> 1:3 + 1:10
Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
length
 [1]  2  4  6  5  7  9  8 10 12 11
> #Equivalent to:
> c(1 + 1, 2 + 2, 3 + 3, 1 + 4, 2 + 5, 3 + 6, 1 + 7, 2 + 8, 3 + 9, 1 + 10)
 [1]  2  4  6  5  7  9  8 10 12 11
  1. What trigonometric fucntions does R provide?
  • Use ?Trig to find out.
  • cosine, sine, tangent, arc-cosine, arc-sine, arc-tangent, and the two-argument arc-tangent.

Grouping and Summarizing


summarize() can be used to obtain a single datapoint or to generate data based on a grouping.

> #mean dep_delay
> flights %>% summarize(delay=mean(dep_delay,na.rm=TRUE))
# A tibble: 1 x 1
  delay
  <dbl>
1  12.6

To get the mean departure delay for each day the data has to be grouped first.

> # group to obtain daily mean
> flights %>% group_by(year,month,day) %>%
+ summarize(delay=mean(dep_delay,na.rm=TRUE))
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day delay
   <int> <int> <int> <dbl>
 1  2013     1     1 11.5 
 2  2013     1     2 13.9 
 3  2013     1     3 11.0 
 4  2013     1     4  8.95
 5  2013     1     5  5.73
 6  2013     1     6  7.15
 7  2013     1     7  5.42
 8  2013     1     8  2.55
 9  2013     1     9  2.28
10  2013     1    10  2.84
# ... with 355 more rows

The mean arrival delay at first increases with average distance and then it decreases. Flights can likely make up time in the air during longer distances.

> #count, avg. distance, and avg. arrival delay by destination
> delays <- flights %>% 
+   group_by(dest) %>%
+   summarize(count=n(),dist=mean(distance,na.rm=TRUE),
+             delay=mean(arr_delay,na.rm=TRUE)) %>%
+   filter(count>20, dest !="HNL")
> 
> #plotted
> ggplot(data=delays, mapping=aes(x=dist,y=delay))+
+   geom_point(aes(size=count),alpha=1/3, color='firebrick')+
+   geom_smooth(se=FALSE)

Missing Values

Missing values must be removed or they will populate the entire column.

> flights %>%
+   group_by(year,month,day)%>%
+   summarize(mean=mean(dep_delay))
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day  mean
   <int> <int> <int> <dbl>
 1  2013     1     1    NA
 2  2013     1     2    NA
 3  2013     1     3    NA
 4  2013     1     4    NA
 5  2013     1     5    NA
 6  2013     1     6    NA
 7  2013     1     7    NA
 8  2013     1     8    NA
 9  2013     1     9    NA
10  2013     1    10    NA
# ... with 355 more rows

Fortunately it is an easy fix.

> #with NAs removed
> flights %>%
+   group_by(year,month,day)%>%
+   summarize(mean=mean(dep_delay, na.rm=TRUE))
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day  mean
   <int> <int> <int> <dbl>
 1  2013     1     1 11.5 
 2  2013     1     2 13.9 
 3  2013     1     3 11.0 
 4  2013     1     4  8.95
 5  2013     1     5  5.73
 6  2013     1     6  7.15
 7  2013     1     7  5.42
 8  2013     1     8  2.55
 9  2013     1     9  2.28
10  2013     1    10  2.84
# ... with 355 more rows

Or you could filter out the NA values before summarizing.

> #filter NA values first
> not_cancelled <-flights %>%
+   filter(!is.na(dep_delay), !is.na(arr_delay))
> not_cancelled %>% group_by(year,month,day)%>%
+   summarize(mean=mean(dep_delay))
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day  mean
   <int> <int> <int> <dbl>
 1  2013     1     1 11.4 
 2  2013     1     2 13.7 
 3  2013     1     3 10.9 
 4  2013     1     4  8.97
 5  2013     1     5  5.73
 6  2013     1     6  7.15
 7  2013     1     7  5.42
 8  2013     1     8  2.56
 9  2013     1     9  2.30
10  2013     1    10  2.84
# ... with 355 more rows

If you want to omit all rows that have missing values you can use na.omit.

> FlightsClean <- flights %>% na.omit

Counts

Including counts is helpful, especially if there’s a chance that some calculations mught be based on a very small number of observations.

  • count values = n()
  • count non-missing values = sum(!is.na(x))

In the following plot we can see that some planes have an average delay of 300 minutes.

> #average arrival delay
> delays<- not_cancelled %>%
+   group_by(tailnum) %>%
+   summarize(delay=mean(arr_delay))
> 
> ggplot(data=delays,mapping=aes(x=delay))+
+   geom_freqpoly(binwidth=10, color='firebrick')

However, when count is included we can see that the high variation takes place with a small number of flights.

> delays <-not_cancelled %>%
+   group_by(tailnum) %>%
+   summarize(delay=mean(arr_delay,na.rm=TRUE),n=n())
> 
> ggplot(data=delays,mapping=aes(x=n, y=delay))+
+   geom_point(alpha=1/10, color='firebrick')

There is much less variation when the small counts are removed.

> #filter out small counts
> delays %>%
+   filter(n>25) %>%
+   ggplot(mapping=aes(x=n, y=delay))+
+   geom_point(alpha=1/10, color='firebrick')

This can also be demonstrated with the Batting dataset.

> #convert Batting dataset to tibble
> batting <- as_tibble(Lahman::Batting)
> 
> #gather batting average and at bats
> batters<- batting %>%
+   group_by(playerID) %>%
+   summarize(
+     ba=sum(H, na.rm=TRUE)/sum(AB, na.rm=TRUE),
+     ab=sum(AB, na.rm=TRUE)
+   )
> 
> #filter and chart
> batters %>% 
+   filter(ab>100) %>%
+   ggplot(mapping=aes(x=ab, y=ba))+
+   geom_point()+geom_smooth(se=FALSE)

  • The variation decreases as at-bats increases
  • There is positive correlation. Better players are selected to hit more.

If you didn’t filter for very low count, you would see that the best batting averages were related to luck.

> batters %>%
+   arrange(desc(ba))
# A tibble: 19,689 x 3
   playerID     ba    ab
   <chr>     <dbl> <int>
 1 abramge01     1     1
 2 alanirj01     1     1
 3 alberan01     1     1
 4 banisje01     1     1
 5 bartocl01     1     1
 6 bassdo01      1     1
 7 birasst01     1     2
 8 bruneju01     1     1
 9 burnscb01     1     1
10 cammaer01     1     1
# ... with 19,679 more rows

Summary Functions

There are also many other useful summary functions, such as:

  • mean(x) - average
  • median(x) - median
  • sd(x) - standard deviation
  • IQR(x) - interquartile range
  • mad(x) - median absoulte deviation
  • min(x) - minimum
  • quantile(x, 25) - value of x greater than 25% of values and less than 75% of the remaing values
  • max(x) - maximum
  • first(x) - first
  • nth(x, 2) - second
  • last(x) - last
  • n_distinct(x) - number of unique values

You can also combine aggregation with subsetting, as shown below:

> not_cancelled %>%
+   group_by(year, month, day) %>%
+   summarize(
+     #average delay:
+     avg_delay1 = mean(arr_delay),
+     #average positive delay
+     avg_delay2 = mean(arr_delay[arr_delay>0])
+   )
# A tibble: 365 x 5
# Groups:   year, month [12]
    year month   day avg_delay1 avg_delay2
   <int> <int> <int>      <dbl>      <dbl>
 1  2013     1     1     12.7         32.5
 2  2013     1     2     12.7         32.0
 3  2013     1     3      5.73        27.7
 4  2013     1     4     -1.93        28.3
 5  2013     1     5     -1.53        22.6
 6  2013     1     6      4.24        24.4
 7  2013     1     7     -4.95        27.8
 8  2013     1     8     -3.23        20.8
 9  2013     1     9     -0.264       25.6
10  2013     1    10     -5.90        27.3
# ... with 355 more rows

Using standard deviation:

> #distance to some destinations is more variable
> not_cancelled %>%
+   group_by(dest) %>%
+   summarize(distance_sd=sd(distance)) %>%
+   arrange(desc(distance_sd))
# A tibble: 104 x 2
   dest  distance_sd
   <chr>       <dbl>
 1 EGE         10.5 
 2 SAN         10.4 
 3 SFO         10.2 
 4 HNL         10.0 
 5 SEA          9.98
 6 LAS          9.91
 7 PDX          9.87
 8 PHX          9.86
 9 LAX          9.66
10 IND          9.46
# ... with 94 more rows

Measures of rank:

> #first and last flights of the day
> not_cancelled %>%
+   group_by(year, month, day) %>%
+   summarize(
+     first=min(dep_time),
+     last=max(dep_time)
+   )
# A tibble: 365 x 5
# Groups:   year, month [12]
    year month   day first  last
   <int> <int> <int> <int> <int>
 1  2013     1     1   517  2356
 2  2013     1     2    42  2354
 3  2013     1     3    32  2349
 4  2013     1     4    25  2358
 5  2013     1     5    14  2357
 6  2013     1     6    16  2355
 7  2013     1     7    49  2359
 8  2013     1     8   454  2351
 9  2013     1     9     2  2252
10  2013     1    10     3  2320
# ... with 355 more rows

Measures of position:

> #first and last flights of the day
> not_cancelled %>%
+   group_by(year, month, day) %>%
+   summarize(
+     first_dep=first(dep_time),
+     last_dep=last(dep_time)
+   )
# A tibble: 365 x 5
# Groups:   year, month [12]
    year month   day first_dep last_dep
   <int> <int> <int>     <int>    <int>
 1  2013     1     1       517     2356
 2  2013     1     2        42     2354
 3  2013     1     3        32     2349
 4  2013     1     4        25     2358
 5  2013     1     5        14     2357
 6  2013     1     6        16     2355
 7  2013     1     7        49     2359
 8  2013     1     8       454     2351
 9  2013     1     9         2     2252
10  2013     1    10         3     2320
# ... with 355 more rows

The first and last flights can be retrieved in several ways. Another is provided below:

> #first and last flight of the day
> not_cancelled %>%
+   group_by(year, month, day) %>%
+   #highest dep_time get lowest rank
+   mutate(r=min_rank(desc(dep_time))) %>%
+   # %in% range(r) displays highest and lowest
+   filter(r %in% range(r)) %>%
+   select(year,month,day,dep_time,r)
# A tibble: 770 x 5
# Groups:   year, month, day [365]
    year month   day dep_time     r
   <int> <int> <int>    <int> <int>
 1  2013     1     1      517   831
 2  2013     1     1     2356     1
 3  2013     1     2       42   928
 4  2013     1     2     2354     1
 5  2013     1     3       32   900
 6  2013     1     3     2349     1
 7  2013     1     4       25   908
 8  2013     1     4     2358     1
 9  2013     1     4     2358     1
10  2013     1     5       14   717
# ... with 760 more rows

Distinct counts:

> # destinations by most distinct carriers
> not_cancelled %>%
+   group_by(dest) %>%
+   summarize(carriers=n_distinct(carrier))%>%
+   arrange(desc(carriers))
# A tibble: 104 x 2
   dest  carriers
   <chr>    <int>
 1 ATL          7
 2 BOS          7
 3 CLT          7
 4 ORD          7
 5 TPA          7
 6 AUS          6
 7 DCA          6
 8 DTW          6
 9 IAD          6
10 MSP          6
# ... with 94 more rows

A simple count:

> not_cancelled %>%
+   count(dest)
# A tibble: 104 x 2
   dest      n
   <chr> <int>
 1 ABQ     254
 2 ACK     264
 3 ALB     418
 4 ANC       8
 5 ATL   16837
 6 AUS    2411
 7 AVL     261
 8 BDL     412
 9 BGR     358
10 BHM     269
# ... with 94 more rows

A weighting can be applied to a count. In this case we can view total miles flown for each plane:

> #weighting - total miles flown
> not_cancelled %>%
+   count(tailnum, wt=distance)
# A tibble: 4,037 x 2
   tailnum      n
   <chr>    <dbl>
 1 D942DN    3418
 2 N0EGMQ  239143
 3 N10156  109664
 4 N102UW   25722
 5 N103US   24619
 6 N104UW   24616
 7 N10575  139903
 8 N105UW   23618
 9 N107US   21677
10 N108UW   32070
# ... with 4,027 more rows

You can also use logical values, where TRUE=1 and FALSE=0. Here we can find the number of flights that left before 5 AM:

> not_cancelled %>%
+   group_by(year,month,day) %>%
+   summarize(n_early = sum(dep_time<500))
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day n_early
   <int> <int> <int>   <int>
 1  2013     1     1       0
 2  2013     1     2       3
 3  2013     1     3       4
 4  2013     1     4       3
 5  2013     1     5       3
 6  2013     1     6       2
 7  2013     1     7       2
 8  2013     1     8       1
 9  2013     1     9       3
10  2013     1    10       3
# ... with 355 more rows

Proportions are also obtainable. Here is the proportion of flights delayed by more than an hour:

> not_cancelled %>%
+   group_by(year, month, day)%>%
+   summarize(hour_perc=mean(arr_delay >60))
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day hour_perc
   <int> <int> <int>     <dbl>
 1  2013     1     1    0.0722
 2  2013     1     2    0.0851
 3  2013     1     3    0.0567
 4  2013     1     4    0.0396
 5  2013     1     5    0.0349
 6  2013     1     6    0.0470
 7  2013     1     7    0.0333
 8  2013     1     8    0.0213
 9  2013     1     9    0.0202
10  2013     1    10    0.0183
# ... with 355 more rows

Grouping by Multiple Variables

Grouping by multiple variables allows you to progressively summarize each level.

> #total flights per day
> daily <- flights %>% group_by(year,month,day)
> (per_day <- daily %>% summarize(flights=n()))
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day flights
   <int> <int> <int>   <int>
 1  2013     1     1     842
 2  2013     1     2     943
 3  2013     1     3     914
 4  2013     1     4     915
 5  2013     1     5     720
 6  2013     1     6     832
 7  2013     1     7     933
 8  2013     1     8     899
 9  2013     1     9     902
10  2013     1    10     932
# ... with 355 more rows
> #total flights per month
> (per_month <- per_day %>% 
+     summarize(flights=sum(flights)))
# A tibble: 12 x 3
# Groups:   year [1]
    year month flights
   <int> <int>   <int>
 1  2013     1   27004
 2  2013     2   24951
 3  2013     3   28834
 4  2013     4   28330
 5  2013     5   28796
 6  2013     6   28243
 7  2013     7   29425
 8  2013     8   29327
 9  2013     9   27574
10  2013    10   28889
11  2013    11   27268
12  2013    12   28135
> #total flights per year
> (per_year <- per_month %>% 
+     summarize(flights=sum(flights)))
# A tibble: 1 x 2
   year flights
  <int>   <int>
1  2013  336776

Ungrouping

Grouping can be removed with ungroup()

> daily %>% ungroup() %>% 
+   summarize(flights=n())
# A tibble: 1 x 1
  flights
    <int>
1  336776

Grouping and Summarizing - Exercises

  1. Brainstorm at least five different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios:
  • A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.

  • A flight is always 10 minutes late.

  • A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.

  • 99% of the time a flight is on time. 1% of the time it’s 2 hours late.

Which is more important: arrival delay or departure delay?

  • Arrival delay is more important. Sometimes a departure delay could be reduced during the flight time.
  • Below are 5 ways of evaluating delay characteristics.
> #1. destinations with average arrival delay <=-30 minutes (30 min early) 
> #>10% of the time.
> flights %>% group_by(dest) %>% summarize(n=n(),
+   min_perc=mean(arr_delay<=-30, na.rm=TRUE)) %>% 
+   filter(min_perc>0.10) %>% 
+   arrange(desc(min_perc))
# A tibble: 27 x 3
   dest      n min_perc
   <chr> <int>    <dbl>
 1 ANC       8    0.25 
 2 SNA     825    0.208
 3 ILM     110    0.178
 4 SAT     686    0.178
 5 LGB     668    0.174
 6 ABQ     254    0.173
 7 SEA    3923    0.171
 8 OAK     312    0.165
 9 HNL     707    0.164
10 SFO   13331    0.161
# ... with 17 more rows
> #2. destinations with average arrival delay >=30 minutes (30 min late) 
> #>25% of the time.
> flights %>% group_by(dest) %>% summarize(n=n(),
+   min_perc=mean(arr_delay>=30, na.rm=TRUE)) %>% 
+   filter(min_perc>0.25) %>% 
+   arrange(desc(min_perc))
# A tibble: 12 x 3
   dest      n min_perc
   <chr> <int>    <dbl>
 1 CAE     116    0.481
 2 OKC     346    0.390
 3 TUL     315    0.384
 4 JAC      25    0.381
 5 TYS     631    0.308
 6 DSM     569    0.275
 7 BHM     297    0.275
 8 MSN     572    0.273
 9 RIC    2454    0.264
10 CRW     138    0.261
11 SMF     284    0.259
12 GSP     849    0.254
> #3. destinations with highest standard deviation of arrival delay
> flights %>% group_by(dest) %>% summarize(n=n(),
+   arr_delay_sd=sd(arr_delay, na.rm=TRUE)) %>% 
+   arrange(desc(arr_delay_sd))
# A tibble: 105 x 3
   dest      n arr_delay_sd
   <chr> <int>        <dbl>
 1 HNL     707         60.8
 2 TUL     315         60.3
 3 TVC     101         59.3
 4 CAK     864         58.3
 5 TYS     631         58.0
 6 SAT     686         57.8
 7 DSM     569         56.9
 8 MSN     572         56.8
 9 BHM     297         56.2
10 CVG    3941         55.1
# ... with 95 more rows
> #4 average departure delay by airport
> flights %>% group_by(origin) %>% summarize(n=n(),
+   dep_delay_mean=mean(dep_delay, na.rm=TRUE)) 
# A tibble: 3 x 3
  origin      n dep_delay_mean
  <chr>   <int>          <dbl>
1 EWR    120835           15.1
2 JFK    111279           12.1
3 LGA    104662           10.3
> #5 Planes with greatest average departure delay
> flights %>% group_by(tailnum) %>% summarize(n=n(),
+ dep_delay_mean=mean(dep_delay, na.rm=TRUE)) %>% 
+   filter(n>5) %>% 
+   arrange(desc(dep_delay_mean))
# A tibble: 3,590 x 3
   tailnum     n dep_delay_mean
   <chr>   <int>          <dbl>
 1 N665MQ      6          177  
 2 N276AT      6           84.8
 3 N652SW      6           79.5
 4 N919FJ      6           78  
 5 N396SW      7           69.7
 6 N354AT     10           65.1
 7 N337AT     13           63.2
 8 N388SW      7           59.3
 9 N261AT      9           59  
10 N550NW      7           58.6
# ... with 3,580 more rows
  1. Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum,wt=distance) (without using count()).
> not_cancelled <- flights %>% 
+   filter(!is.na(dep_delay),!is.na(arr_delay))
> 
> (x <- not_cancelled %>% count(dest))
# A tibble: 104 x 2
   dest      n
   <chr> <int>
 1 ABQ     254
 2 ACK     264
 3 ALB     418
 4 ANC       8
 5 ATL   16837
 6 AUS    2411
 7 AVL     261
 8 BDL     412
 9 BGR     358
10 BHM     269
# ... with 94 more rows
> (y <- not_cancelled %>% group_by(dest) 
+ %>% summarize(n=max(min_rank(dep_delay))))
# A tibble: 104 x 2
   dest      n
   <chr> <int>
 1 ABQ     254
 2 ACK     264
 3 ALB     418
 4 ANC       8
 5 ATL   16837
 6 AUS    2411
 7 AVL     261
 8 BDL     412
 9 BGR     358
10 BHM     269
# ... with 94 more rows
> identical(x,y)
[1] TRUE
> (x <- not_cancelled %>% count(tailnum, wt=distance))
# A tibble: 4,037 x 2
   tailnum      n
   <chr>    <dbl>
 1 D942DN    3418
 2 N0EGMQ  239143
 3 N10156  109664
 4 N102UW   25722
 5 N103US   24619
 6 N104UW   24616
 7 N10575  139903
 8 N105UW   23618
 9 N107US   21677
10 N108UW   32070
# ... with 4,027 more rows
> (y <- not_cancelled %>% group_by(tailnum) 
+   %>% summarize(n=sum(distance)))
# A tibble: 4,037 x 2
   tailnum      n
   <chr>    <dbl>
 1 D942DN    3418
 2 N0EGMQ  239143
 3 N10156  109664
 4 N102UW   25722
 5 N103US   24619
 6 N104UW   24616
 7 N10575  139903
 8 N105UW   23618
 9 N107US   21677
10 N108UW   32070
# ... with 4,027 more rows
> identical(x,y)
[1] TRUE
  1. Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay) is suboptimal. Why? Which is the most important column?
  • Some flights departed with no arr_delay recorded
  • A better indicator is dep_time
> #Some flights depart with no air time recorded
> flights %>% filter(is.na(air_time)) %>% 
+   select(flight, dep_time, arr_time, dep_delay, arr_delay, air_time)
# A tibble: 9,430 x 6
   flight dep_time arr_time dep_delay arr_delay air_time
    <int>    <int>    <int>     <dbl>     <dbl>    <dbl>
 1   4525     1525     1934        -5        NA       NA
 2   3806     1528     2002        29        NA       NA
 3   4413     1740     2158        -5        NA       NA
 4   1228     1807     2251        29        NA       NA
 5   3325     1939       29        59        NA       NA
 6   4333     1952     2358        22        NA       NA
 7   4204     2016       NA        46        NA       NA
 8   4308       NA       NA        NA        NA       NA
 9    791       NA       NA        NA        NA       NA
10   1925       NA       NA        NA        NA       NA
# ... with 9,420 more rows
> #Flights that did not depart
> flights %>% filter(is.na(dep_time)) %>% 
+   select(flight, dep_time, arr_time, dep_delay, arr_delay, air_time)
# A tibble: 8,255 x 6
   flight dep_time arr_time dep_delay arr_delay air_time
    <int>    <int>    <int>     <dbl>     <dbl>    <dbl>
 1   4308       NA       NA        NA        NA       NA
 2    791       NA       NA        NA        NA       NA
 3   1925       NA       NA        NA        NA       NA
 4    125       NA       NA        NA        NA       NA
 5   4352       NA       NA        NA        NA       NA
 6   4406       NA       NA        NA        NA       NA
 7   4434       NA       NA        NA        NA       NA
 8   4935       NA       NA        NA        NA       NA
 9   3849       NA       NA        NA        NA       NA
10    133       NA       NA        NA        NA       NA
# ... with 8,245 more rows
  1. Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?
  • The number of cancelled flights tends to increase with the number of total flights.
  • The proportion of cancelled flights tends to increase with average delay.
> (cancelled_per_day <- flights %>%
+   mutate(cancelled = is.na(dep_time)) %>%
+   group_by(year, month, day) %>%
+   summarize(cancelled_num = sum(cancelled),flights_num = n()))
# A tibble: 365 x 5
# Groups:   year, month [12]
    year month   day cancelled_num flights_num
   <int> <int> <int>         <int>       <int>
 1  2013     1     1             4         842
 2  2013     1     2             8         943
 3  2013     1     3            10         914
 4  2013     1     4             6         915
 5  2013     1     5             3         720
 6  2013     1     6             1         832
 7  2013     1     7             3         933
 8  2013     1     8             4         899
 9  2013     1     9             5         902
10  2013     1    10             3         932
# ... with 355 more rows
> ggplot(cancelled_per_day) +
+   geom_point(aes(x = flights_num, y = cancelled_num), color="firebrick")+
+   theme_light()

> (cancelled_and_delays <- flights %>%
+   mutate(cancelled = is.na(dep_time)) %>%
+   group_by(year, month, day) %>%
+   summarize(cancelled_prop = mean(cancelled),
+     avg_dep_delay = mean(dep_delay, na.rm = TRUE),
+     avg_arr_delay = mean(arr_delay, na.rm = TRUE)))
# A tibble: 365 x 6
# Groups:   year, month [12]
    year month   day cancelled_prop avg_dep_delay avg_arr_delay
   <int> <int> <int>          <dbl>         <dbl>         <dbl>
 1  2013     1     1        0.00475         11.5         12.7  
 2  2013     1     2        0.00848         13.9         12.7  
 3  2013     1     3        0.0109          11.0          5.73 
 4  2013     1     4        0.00656          8.95        -1.93 
 5  2013     1     5        0.00417          5.73        -1.53 
 6  2013     1     6        0.00120          7.15         4.24 
 7  2013     1     7        0.00322          5.42        -4.95 
 8  2013     1     8        0.00445          2.55        -3.23 
 9  2013     1     9        0.00554          2.28        -0.264
10  2013     1    10        0.00322          2.84        -5.90 
# ... with 355 more rows
> ggplot(cancelled_and_delays) +
+   geom_point(aes(x = avg_dep_delay, y = cancelled_prop), color="firebrick")+
+   theme_light()

  1. Which carrier has the worst delays?
> flights %>%
+   group_by(carrier) %>%
+   summarize(arr_delay_mean = mean(arr_delay, na.rm = TRUE)) %>%
+   arrange(desc(arr_delay_mean)) %>% left_join(airlines)
# A tibble: 16 x 3
   carrier arr_delay_mean name                       
   <chr>            <dbl> <chr>                      
 1 F9              21.9   Frontier Airlines Inc.     
 2 FL              20.1   AirTran Airways Corporation
 3 EV              15.8   ExpressJet Airlines Inc.   
 4 YV              15.6   Mesa Airlines Inc.         
 5 OO              11.9   SkyWest Airlines Inc.      
 6 MQ              10.8   Envoy Air                  
 7 WN               9.65  Southwest Airlines Co.     
 8 B6               9.46  JetBlue Airways            
 9 9E               7.38  Endeavor Air Inc.          
10 UA               3.56  United Air Lines Inc.      
11 US               2.13  US Airways Inc.            
12 VX               1.76  Virgin America             
13 DL               1.64  Delta Air Lines Inc.       
14 AA               0.364 American Airlines Inc.     
15 HA              -6.92  Hawaiian Airlines Inc.     
16 AS              -9.93  Alaska Airlines Inc.       
  1. For each plane, count the number of flights before the first delay of greater than 1 hour.
> flights %>%
+   select(tailnum, year, month,day, arr_delay) %>%
+   filter(!is.na(arr_delay)) %>%
+   #sort by plane and then day
+   arrange(tailnum, year, month, day) %>%
+   group_by(tailnum) %>%
+   # cumulative number of flights delayed over one hour
+   mutate(cum_hr_delays = cumsum(arr_delay > 60)) %>%
+   # count the number of flights == 0
+   summarize(total_flights = sum(cum_hr_delays < 1)) %>%
+   arrange(desc(total_flights))
# A tibble: 4,037 x 2
   tailnum total_flights
   <chr>           <int>
 1 N717TW            119
 2 N705TW             97
 3 N765US             97
 4 N3758Y             96
 5 N12125             94
 6 N320AA             94
 7 N13110             91
 8 N744P              90
 9 N37456             88
10 N709TW             88
# ... with 4,027 more rows
  1. What does the sort argument to count() do? When might you use it?.
  • It sorts the results by n(), descending. It is helpful when you’d like to see the largest group first.
> flights %>%
+   count(dest, sort = TRUE)
# A tibble: 105 x 2
   dest      n
   <chr> <int>
 1 ORD   17283
 2 ATL   17215
 3 LAX   16174
 4 BOS   15508
 5 MCO   14082
 6 CLT   14064
 7 SFO   13331
 8 FLL   12055
 9 MIA   11728
10 DCA    9705
# ... with 95 more rows

Summarize Across Columns


The recently launched dplyr 1.0.0 includes a new across() function which allows you to perform the same operation across many columns. See the article:

https://www.tidyverse.org/blog/2020/04/dplyr-1-0-0-colwise/

> starwars %>% 
+   group_by(species) %>% 
+   filter(n() > 1) %>% 
+   summarize(across(c(sex, gender, homeworld), n_distinct))
# A tibble: 9 x 4
  species    sex gender homeworld
  <chr>    <int>  <int>     <int>
1 Droid        1      2         3
2 Gungan       1      1         1
3 Human        2      2        16
4 Kaminoan     2      2         1
5 Mirialan     1      1         1
6 Twi'lek      2      2         1
7 Wookiee      1      1         1
8 Zabrak       1      1         2
9 <NA>         1      1         3
> starwars %>% 
+   group_by(homeworld) %>% 
+   filter(n() > 1) %>% 
+   summarize(across(where(is.numeric), mean, na.rm = TRUE), n = n())
# A tibble: 10 x 5
   homeworld height  mass birth_year     n
   <chr>      <dbl> <dbl>      <dbl> <int>
 1 Alderaan    176.  64         43       3
 2 Corellia    175   78.5       25       2
 3 Coruscant   174.  50         91       3
 4 Kamino      208.  83.1       31.5     3
 5 Kashyyyk    231  124        200       2
 6 Mirial      168   53.1       49       2
 7 Naboo       175.  64.2       55      11
 8 Ryloth      179   55         48       2
 9 Tatooine    170.  85.4       54.6    10
10 <NA>        139.  82        334.     10

Grouped Mutates and Filters


In addition to summarize() you can also use mutate() and filter() with groups.

> #9 highest arr_delay by day
> flights_sml %>% 
+   group_by(year,month,day) %>% 
+   filter(rank(desc(arr_delay))<10)
# A tibble: 3,306 x 7
# Groups:   year, month, day [365]
    year month   day dep_delay arr_delay distance air_time
   <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
 1  2013     1     1       853       851      184       41
 2  2013     1     1       290       338     1134      213
 3  2013     1     1       260       263      266       46
 4  2013     1     1       157       174      213       60
 5  2013     1     1       216       222      708      121
 6  2013     1     1       255       250      589      115
 7  2013     1     1       285       246     1085      146
 8  2013     1     1       192       191      199       44
 9  2013     1     1       379       456     1092      222
10  2013     1     2       224       207      550       94
# ... with 3,296 more rows
> # Only destinations with >10,000 flights
> (popular_dests <- flights %>% 
+   group_by(dest) %>% 
+   filter(n()>10000))
# A tibble: 131,440 x 19
# Groups:   dest [9]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      542            540         2      923            850
 2  2013     1     1      554            600        -6      812            837
 3  2013     1     1      554            558        -4      740            728
 4  2013     1     1      555            600        -5      913            854
 5  2013     1     1      557            600        -3      838            846
 6  2013     1     1      558            600        -2      753            745
 7  2013     1     1      558            600        -2      924            917
 8  2013     1     1      558            600        -2      923            937
 9  2013     1     1      559            559         0      702            706
10  2013     1     1      600            600         0      851            858
# ... with 131,430 more rows, and 11 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
> #proportion delay by destination
> popular_dests %>% 
+   filter(arr_delay>0) %>% 
+   mutate(prop_delay = arr_delay/sum(arr_delay)) %>% 
+   select(year:day, dest, arr_delay, prop_delay)
# A tibble: 50,245 x 6
# Groups:   dest [9]
    year month   day dest  arr_delay prop_delay
   <int> <int> <int> <chr>     <dbl>      <dbl>
 1  2013     1     1 MIA          33  0.000235 
 2  2013     1     1 ORD          12  0.0000424
 3  2013     1     1 FLL          19  0.0000938
 4  2013     1     1 ORD           8  0.0000283
 5  2013     1     1 LAX           7  0.0000344
 6  2013     1     1 ATL          12  0.0000400
 7  2013     1     1 ORD          32  0.000113 
 8  2013     1     1 SFO          14  0.0000682
 9  2013     1     1 MIA           5  0.0000356
10  2013     1     1 LAX          29  0.000143 
# ... with 50,235 more rows

Grouping by Multiple Variables - Exercises

  1. Which plane (tailnum) has the worst on-time record?
  • N203FR, based on average and median arrival delay.
> worst_time <- flights %>% group_by(tailnum) %>% 
+   summarize(avg_arr_delay=mean(arr_delay, na.rm=TRUE), 
+             med_arr_delay=median(arr_delay, na.rm=TRUE),
+             n=n()) %>% 
+   filter(n>20)  
>   
> #top 5 median arr_delay
> worst_time %>% arrange(desc(med_arr_delay)) %>% head(5)
# A tibble: 5 x 4
  tailnum avg_arr_delay med_arr_delay     n
  <chr>           <dbl>         <dbl> <int>
1 N203FR           59.1            21    41
2 N988AT           44.3            21    37
3 N353AT           41.2            17    21
4 N731SA           27.4            17    25
5 N980AT           35.2            17    47
> #top 5 avg arr_delay
> worst_time %>% arrange(desc(avg_arr_delay)) %>% head(5)
# A tibble: 5 x 4
  tailnum avg_arr_delay med_arr_delay     n
  <chr>           <dbl>         <dbl> <int>
1 N203FR           59.1          21      41
2 N645MQ           51             6.5    25
3 N956AT           47.6           5.5    36
4 N988AT           44.3          21      37
5 N521VA           42.2          -8      27
  1. Which time of day should you fly if you want to avoid delays as much as possible?
  • The morning hours seem best.
> flights %>%
+   group_by(hour) %>%
+   summarize(avg_arr_delay = mean(arr_delay, na.rm = TRUE),
+             med_arr_delay = median(arr_delay, na.rm = TRUE)) %>%
+   arrange(avg_arr_delay) %>% head(5)
# A tibble: 5 x 3
   hour avg_arr_delay med_arr_delay
  <dbl>         <dbl>         <dbl>
1     7         -5.30           -10
2     5         -4.80            -8
3     6         -3.38            -8
4     9         -1.45            -8
5     8         -1.11            -8
  1. For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination.
> flightsx <- flights %>% filter(!is.na(arr_delay),arr_delay>0) %>% 
+   group_by(dest) %>%
+   mutate(tot_arr_delay = sum(arr_delay),
+             prop_arr_delay=arr_delay/tot_arr_delay) %>% 
+   select(dest,flight,tot_arr_delay,prop_arr_delay)
> 
> flightsx %>% group_by(dest,flight) %>% 
+   summarize(tot_prop=sum(prop_arr_delay))
# A tibble: 8,505 x 3
# Groups:   dest [103]
   dest  flight tot_prop
   <chr>  <int>    <dbl>
 1 ABQ       65 0.433   
 2 ABQ     1505 0.567   
 3 ACK     1191 0.475   
 4 ACK     1195 0.0208  
 5 ACK     1291 0.0898  
 6 ACK     1491 0.414   
 7 ALB     3260 0.0116  
 8 ALB     3264 0.000418
 9 ALB     3811 0.0625  
10 ALB     3817 0.0205  
# ... with 8,495 more rows
  1. Delays are typically correlated: even when the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag() explore how the delay of a flight is related to the delay of the immediately preceding flight.
> lagged_delays <- flights %>%
+   arrange(origin, month, day, dep_time) %>%
+   group_by(origin) %>%
+   mutate(dep_delay_lag = lag(dep_delay)) %>%
+   filter(!is.na(dep_delay), !is.na(dep_delay_lag))
> 
> lagged_delays %>% select(origin, month, day, 
+                          dep_time, dep_delay,dep_delay_lag)
# A tibble: 327,649 x 6
# Groups:   origin [3]
   origin month   day dep_time dep_delay dep_delay_lag
   <chr>  <int> <int>    <int>     <dbl>         <dbl>
 1 EWR        1     1      554        -4             2
 2 EWR        1     1      555        -5            -4
 3 EWR        1     1      558        -2            -5
 4 EWR        1     1      559        -1            -2
 5 EWR        1     1      601         1            -1
 6 EWR        1     1      606        -4             1
 7 EWR        1     1      607         0            -4
 8 EWR        1     1      608         8             0
 9 EWR        1     1      615         0             8
10 EWR        1     1      622        -8             0
# ... with 327,639 more rows
> lagged_delays %>% group_by(dep_delay_lag) %>% 
+   summarize(mean_dep_delay=mean(dep_delay)) %>% 
+   ggplot(mapping=aes(y=mean_dep_delay,x=dep_delay_lag))+
+   geom_point(color="firebrick")+
+   scale_x_continuous(breaks = seq(0, 1500, by = 200)) +
+   labs(y = "Departure Delay", x = "Previous Departure Delay")

  1. Look at each destination. Can you find flights that are suspiciously fast?
  • I looked at outliers based on the number of standard deviations.
  • The flight to ATL at over 700 mph was the greatest outlier.
> (flightsy <- flights %>% filter(!is.na(air_time)) %>% 
+   mutate(speed=distance/air_time*60) %>% 
+   group_by(dest,flight) %>% 
+   mutate(avg_speed=mean(speed),max_speed=max(speed), sd_speed=sd(speed),
+          num_sd=(speed-avg_speed)/sd_speed) %>% 
+   select(dest,flight,avg_speed,max_speed,sd_speed,num_sd) %>% 
+   arrange(desc(num_sd)))
# A tibble: 327,346 x 6
# Groups:   dest, flight [11,421]
   dest  flight avg_speed max_speed sd_speed num_sd
   <chr>  <int>     <dbl>     <dbl>    <dbl>  <dbl>
 1 ATL     1499      407.      703.     36.9   8.03
 2 MSP     4667      396.      650.     33.4   7.61
 3 BOS     2132      300.      526.     36.6   6.17
 4 CVG     4687      356.      551.     32.7   5.96
 5 BOS     2142      297.      480      32.1   5.71
 6 PIT     5486      344.      502.     28.6   5.53
 7 BUF     2002      326.      475.     28.1   5.30
 8 PBI     1902      425.      591.     32.1   5.19
 9 ROC       30      309.      453.     27.8   5.18
10 BNA     3805      391.      641.     48.7   5.14
# ... with 327,336 more rows
> #plot the number of standard deviations
> ggplot(flightsy, aes(x=num_sd))+
+   geom_histogram(binwidth=0.1, fill="firebrick")

> #zoom in
> ggplot(flightsy, aes(x=num_sd))+
+   geom_histogram(binwidth=0.1, fill="firebrick")+
+   coord_cartesian(ylim=c(0,5))