Wk 3 Assignment: Data Transformation from the Hands-on Programming with R
title: "Untitled" author: "Suma Pendyala" date: "6/7/2020" output: html_document
5.1.1 Prerequisites
library(nycflights13) library(tidyverse)
5.1.2 nycflights13
flights
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 336,776 x 19 #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 1 1 517 515 2 830 819 #> 2 2013 1 1 533 529 4 850 830 #> 3 2013 1 1 542 540 2 923 850 #> 4 2013 1 1 544 545 -1 1004 1022 #> 5 2013 1 1 554 600 -6 812 837 #> 6 2013 1 1 554 558 -4 740 728 #> # . with 336,770 more rows, and 11 more variables: arr_delay <dbl>, #> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, #> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
5.2 Filter rows with filter()
filter(flights, month == 1, day == 1)
## # A tibble: 842 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 832 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 842 x 19 #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 1 1 517 515 2 830 819 #> 2 2013 1 1 533 529 4 850 830 #> 3 2013 1 1 542 540 2 923 850 #> 4 2013 1 1 544 545 -1 1004 1022 #> 5 2013 1 1 554 600 -6 812 837 #> 6 2013 1 1 554 558 -4 740 728 #> # . with 836 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>, #> # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
jan1 <- filter(flights, month == 1, day == 1)
(dec25 <- filter(flights, month == 12, day == 25))
## # A tibble: 719 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 12 25 456 500 -4 649 651 ## 2 2013 12 25 524 515 9 805 814 ## 3 2013 12 25 542 540 2 832 850 ## 4 2013 12 25 546 550 -4 1022 1027 ## 5 2013 12 25 556 600 -4 730 745 ## 6 2013 12 25 557 600 -3 743 752 ## 7 2013 12 25 557 600 -3 818 831 ## 8 2013 12 25 559 600 -1 855 856 ## 9 2013 12 25 559 600 -1 849 855 ## 10 2013 12 25 600 600 0 850 846 ## # ... with 709 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 719 x 19 #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 12 25 456 500 -4 649 651 #> 2 2013 12 25 524 515 9 805 814 #> 3 2013 12 25 542 540 2 832 850 #> 4 2013 12 25 546 550 -4 1022 1027 #> 5 2013 12 25 556 600 -4 730 745 #> 6 2013 12 25 557 600 -3 743 752 #> # . with 713 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>, #> # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, #> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
5.2.1 Comparisons
filter(flights, month == 1)
## # A tibble: 27,004 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 26,994 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
sqrt(2) ^ 2 == 2
## [1] FALSE
#> [1] FALSE 1 / 49 * 49 == 1
## [1] FALSE
#> [1] FALSE
near(sqrt(2) ^ 2, 2)
## [1] TRUE
#> [1] TRUE near(1 / 49 * 49, 1)
## [1] TRUE
#> [1] TRUE
5.2.2 Logical operators
filter(flights, month == 11 | month == 12)
## # A tibble: 55,403 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 11 1 5 2359 6 352 345 ## 2 2013 11 1 35 2250 105 123 2356 ## 3 2013 11 1 455 500 -5 641 651 ## 4 2013 11 1 539 545 -6 856 827 ## 5 2013 11 1 542 545 -3 831 855 ## 6 2013 11 1 549 600 -11 912 923 ## 7 2013 11 1 550 600 -10 705 659 ## 8 2013 11 1 554 600 -6 659 701 ## 9 2013 11 1 554 600 -6 826 827 ## 10 2013 11 1 554 600 -6 749 751 ## # ... with 55,393 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
nov_dec <- filter(flights, month %in% c(11, 12))
filter(flights, !(arr_delay > 120 | dep_delay > 120))
## # A tibble: 316,050 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 316,040 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, arr_delay <= 120, dep_delay <= 120)
## # A tibble: 316,050 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 316,040 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
5.2.3 Missing values
NA > 5
## [1] NA
#> [1] NA 10 == NA
## [1] NA
#> [1] NA NA + 10
## [1] NA
#> [1] NA NA / 2
## [1] NA
#> [1] NA
# Let x be Mary's age. We don't know how old she is. x <- NA # Let y be John's age. We don't know how old he is. y <- NA # Are John and Mary the same age? x == y
## [1] NA
#> [1] NA # We don't know! is.na(x)
## [1] TRUE
#> [1] TRUE
df <- tibble(x = c(1, NA, 3)) filter(df, x > 1)
## # A tibble: 1 x 1 ## x ## <dbl> ## 1 3
#> # A tibble: 1 x 1 #> x #> <dbl> #> 1 3 filter(df, is.na(x) | x > 1)
## # A tibble: 2 x 1 ## x ## <dbl> ## 1 NA ## 2 3
#> # A tibble: 2 x 1 #> x #> <dbl> #> 1 NA #> 2 3
5.3 Arrange rows with arrange()
arrange(flights, year, month, day)
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 336,776 x 19 #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 1 1 517 515 2 830 819 #> 2 2013 1 1 533 529 4 850 830 #> 3 2013 1 1 542 540 2 923 850 #> 4 2013 1 1 544 545 -1 1004 1022 #> 5 2013 1 1 554 600 -6 812 837 #> 6 2013 1 1 554 558 -4 740 728 #> # . with 336,770 more rows, and 11 more variables: arr_delay <dbl>, #> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, #> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 9 641 900 1301 1242 1530 ## 2 2013 6 15 1432 1935 1137 1607 2120 ## 3 2013 1 10 1121 1635 1126 1239 1810 ## 4 2013 9 20 1139 1845 1014 1457 2210 ## 5 2013 7 22 845 1600 1005 1044 1815 ## 6 2013 4 10 1100 1900 960 1342 2211 ## 7 2013 3 17 2321 810 911 135 1020 ## 8 2013 6 27 959 1900 899 1236 2226 ## 9 2013 7 22 2257 759 898 121 1026 ## 10 2013 12 5 756 1700 896 1058 2020 ## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 336,776 x 19 #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 1 9 641 900 1301 1242 1530 #> 2 2013 6 15 1432 1935 1137 1607 2120 #> 3 2013 1 10 1121 1635 1126 1239 1810 #> 4 2013 9 20 1139 1845 1014 1457 2210 #> 5 2013 7 22 845 1600 1005 1044 1815 #> 6 2013 4 10 1100 1900 960 1342 2211 #> # . with 336,770 more rows, and 11 more variables: arr_delay <dbl>, #> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, #> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
df <- tibble(x = c(5, 2, NA)) arrange(df, x)
## # A tibble: 3 x 1 ## x ## <dbl> ## 1 2 ## 2 5 ## 3 NA
#> # A tibble: 3 x 1 #> x #> <dbl> #> 1 2 #> 2 5 #> 3 NA arrange(df, desc(x))
## # A tibble: 3 x 1 ## x ## <dbl> ## 1 5 ## 2 2 ## 3 NA
#> # A tibble: 3 x 1 #> x #> <dbl> #> 1 5 #> 2 2 #> 3 NA
5.4 Select columns with select()
# Select columns by name select(flights, year, month, day)
## # A tibble: 336,776 x 3 ## year month day ## <int> <int> <int> ## 1 2013 1 1 ## 2 2013 1 1 ## 3 2013 1 1 ## 4 2013 1 1 ## 5 2013 1 1 ## 6 2013 1 1 ## 7 2013 1 1 ## 8 2013 1 1 ## 9 2013 1 1 ## 10 2013 1 1 ## # ... with 336,766 more rows
#> # A tibble: 336,776 x 3 #> year month day #> <int> <int> <int> #> 1 2013 1 1 #> 2 2013 1 1 #> 3 2013 1 1 #> 4 2013 1 1 #> 5 2013 1 1 #> 6 2013 1 1 #> # . with 336,770 more rows # Select all columns between year and day (inclusive) select(flights, year:day)
## # A tibble: 336,776 x 3 ## year month day ## <int> <int> <int> ## 1 2013 1 1 ## 2 2013 1 1 ## 3 2013 1 1 ## 4 2013 1 1 ## 5 2013 1 1 ## 6 2013 1 1 ## 7 2013 1 1 ## 8 2013 1 1 ## 9 2013 1 1 ## 10 2013 1 1 ## # ... with 336,766 more rows
#> # A tibble: 336,776 x 3 #> year month day #> <int> <int> <int> #> 1 2013 1 1 #> 2 2013 1 1 #> 3 2013 1 1 #> 4 2013 1 1 #> 5 2013 1 1 #> 6 2013 1 1 #> # . with 336,770 more rows # Select all columns except those from year to day (inclusive) select(flights, -(year:day))
## # A tibble: 336,776 x 16 ## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier ## <int> <int> <dbl> <int> <int> <dbl> <chr> ## 1 517 515 2 830 819 11 UA ## 2 533 529 4 850 830 20 UA ## 3 542 540 2 923 850 33 AA ## 4 544 545 -1 1004 1022 -18 B6 ## 5 554 600 -6 812 837 -25 DL ## 6 554 558 -4 740 728 12 UA ## 7 555 600 -5 913 854 19 B6 ## 8 557 600 -3 709 723 -14 EV ## 9 557 600 -3 838 846 -8 B6 ## 10 558 600 -2 753 745 8 AA ## # ... with 336,766 more rows, and 9 more variables: flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 336,776 x 16 #> dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier #> <int> <int> <dbl> <int> <int> <dbl> <chr> #> 1 517 515 2 830 819 11 UA #> 2 533 529 4 850 830 20 UA #> 3 542 540 2 923 850 33 AA #> 4 544 545 -1 1004 1022 -18 B6 #> 5 554 600 -6 812 837 -25 DL #> 6 554 558 -4 740 728 12 UA #> # . with 336,770 more rows, and 9 more variables: flight <int>, tailnum <chr>, #> # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, #> # minute <dbl>, time_hour <dttm>
rename(flights, tail_num = tailnum)
## # A tibble: 336,776 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tail_num <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 336,776 x 19 #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 1 1 517 515 2 830 819 #> 2 2013 1 1 533 529 4 850 830 #> 3 2013 1 1 542 540 2 923 850 #> 4 2013 1 1 544 545 -1 1004 1022 #> 5 2013 1 1 554 600 -6 812 837 #> 6 2013 1 1 554 558 -4 740 728 #> # . with 336,770 more rows, and 11 more variables: arr_delay <dbl>, #> # carrier <chr>, flight <int>, tail_num <chr>, origin <chr>, dest <chr>, #> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights, time_hour, air_time, everything())
## # A tibble: 336,776 x 19 ## time_hour air_time year month day dep_time sched_dep_time ## <dttm> <dbl> <int> <int> <int> <int> <int> ## 1 2013-01-01 05:00:00 227 2013 1 1 517 515 ## 2 2013-01-01 05:00:00 227 2013 1 1 533 529 ## 3 2013-01-01 05:00:00 160 2013 1 1 542 540 ## 4 2013-01-01 05:00:00 183 2013 1 1 544 545 ## 5 2013-01-01 06:00:00 116 2013 1 1 554 600 ## 6 2013-01-01 05:00:00 150 2013 1 1 554 558 ## 7 2013-01-01 06:00:00 158 2013 1 1 555 600 ## 8 2013-01-01 06:00:00 53 2013 1 1 557 600 ## 9 2013-01-01 06:00:00 140 2013 1 1 557 600 ## 10 2013-01-01 06:00:00 138 2013 1 1 558 600 ## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>, ## # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, ## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>, ## # hour <dbl>, minute <dbl>
#> # A tibble: 336,776 x 19 #> time_hour air_time year month day dep_time sched_dep_time #> <dttm> <dbl> <int> <int> <int> <int> <int> #> 1 2013-01-01 05:00:00 227 2013 1 1 517 515 #> 2 2013-01-01 05:00:00 227 2013 1 1 533 529 #> 3 2013-01-01 05:00:00 160 2013 1 1 542 540 #> 4 2013-01-01 05:00:00 183 2013 1 1 544 545 #> 5 2013-01-01 06:00:00 116 2013 1 1 554 600 #> 6 2013-01-01 05:00:00 150 2013 1 1 554 558 #> # . with 336,770 more rows, and 12 more variables: dep_delay <dbl>, #> # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, #> # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>, #> # hour <dbl>, minute <dbl>
5.5 Add new variables with mutate()
flights_sml <- select(flights, year:day, ends_with("delay"), distance, air_time ) mutate(flights_sml, gain = dep_delay - arr_delay, speed = distance / air_time * 60 )
## # A tibble: 336,776 x 9 ## year month day dep_delay arr_delay distance air_time gain speed ## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 2013 1 1 2 11 1400 227 -9 370. ## 2 2013 1 1 4 20 1416 227 -16 374. ## 3 2013 1 1 2 33 1089 160 -31 408. ## 4 2013 1 1 -1 -18 1576 183 17 517. ## 5 2013 1 1 -6 -25 762 116 19 394. ## 6 2013 1 1 -4 12 719 150 -16 288. ## 7 2013 1 1 -5 19 1065 158 -24 404. ## 8 2013 1 1 -3 -14 229 53 11 259. ## 9 2013 1 1 -3 -8 944 140 5 405. ## 10 2013 1 1 -2 8 733 138 -10 319. ## # ... with 336,766 more rows
#> # A tibble: 336,776 x 9 #> year month day dep_delay arr_delay distance air_time gain speed #> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 2013 1 1 2 11 1400 227 -9 370. #> 2 2013 1 1 4 20 1416 227 -16 374. #> 3 2013 1 1 2 33 1089 160 -31 408. #> 4 2013 1 1 -1 -18 1576 183 17 517. #> 5 2013 1 1 -6 -25 762 116 19 394. #> 6 2013 1 1 -4 12 719 150 -16 288. #> # . with 336,770 more rows
mutate(flights_sml, gain = dep_delay - arr_delay, hours = air_time / 60, gain_per_hour = gain / hours )
## # A tibble: 336,776 x 10 ## year month day dep_delay arr_delay distance air_time gain hours ## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 2013 1 1 2 11 1400 227 -9 3.78 ## 2 2013 1 1 4 20 1416 227 -16 3.78 ## 3 2013 1 1 2 33 1089 160 -31 2.67 ## 4 2013 1 1 -1 -18 1576 183 17 3.05 ## 5 2013 1 1 -6 -25 762 116 19 1.93 ## 6 2013 1 1 -4 12 719 150 -16 2.5 ## 7 2013 1 1 -5 19 1065 158 -24 2.63 ## 8 2013 1 1 -3 -14 229 53 11 0.883 ## 9 2013 1 1 -3 -8 944 140 5 2.33 ## 10 2013 1 1 -2 8 733 138 -10 2.3 ## # ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
#> # A tibble: 336,776 x 10 #> year month day dep_delay arr_delay distance air_time gain hours #> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> #> 1 2013 1 1 2 11 1400 227 -9 3.78 #> 2 2013 1 1 4 20 1416 227 -16 3.78 #> 3 2013 1 1 2 33 1089 160 -31 2.67 #> 4 2013 1 1 -1 -18 1576 183 17 3.05 #> 5 2013 1 1 -6 -25 762 116 19 1.93 #> 6 2013 1 1 -4 12 719 150 -16 2.5 #> # . with 336,770 more rows, and 1 more variable: gain_per_hour <dbl>
transmute(flights, gain = dep_delay - arr_delay, hours = air_time / 60, gain_per_hour = gain / hours )
## # A tibble: 336,776 x 3 ## gain hours gain_per_hour ## <dbl> <dbl> <dbl> ## 1 -9 3.78 -2.38 ## 2 -16 3.78 -4.23 ## 3 -31 2.67 -11.6 ## 4 17 3.05 5.57 ## 5 19 1.93 9.83 ## 6 -16 2.5 -6.4 ## 7 -24 2.63 -9.11 ## 8 11 0.883 12.5 ## 9 5 2.33 2.14 ## 10 -10 2.3 -4.35 ## # ... with 336,766 more rows
#> # A tibble: 336,776 x 3 #> gain hours gain_per_hour #> <dbl> <dbl> <dbl> #> 1 -9 3.78 -2.38 #> 2 -16 3.78 -4.23 #> 3 -31 2.67 -11.6 #> 4 17 3.05 5.57 #> 5 19 1.93 9.83 #> 6 -16 2.5 -6.4 #> # . with 336,770 more rows
5.5.1 Useful creation functions
transmute(flights, dep_time, hour = dep_time %/% 100, minute = dep_time %% 100 )
## # A tibble: 336,776 x 3 ## dep_time hour minute ## <int> <dbl> <dbl> ## 1 517 5 17 ## 2 533 5 33 ## 3 542 5 42 ## 4 544 5 44 ## 5 554 5 54 ## 6 554 5 54 ## 7 555 5 55 ## 8 557 5 57 ## 9 557 5 57 ## 10 558 5 58 ## # ... with 336,766 more rows
#> # A tibble: 336,776 x 3 #> dep_time hour minute #> <int> <dbl> <dbl> #> 1 517 5 17 #> 2 533 5 33 #> 3 542 5 42 #> 4 544 5 44 #> 5 554 5 54 #> 6 554 5 54 #> # . with 336,770 more rows
(x <- 1:10)
## [1] 1 2 3 4 5 6 7 8 9 10
#> [1] 1 2 3 4 5 6 7 8 9 10 lag(x)
## [1] NA 1 2 3 4 5 6 7 8 9
#> [1] NA 1 2 3 4 5 6 7 8 9 lead(x)
## [1] 2 3 4 5 6 7 8 9 10 NA
#> [1] 2 3 4 5 6 7 8 9 10 NA
x
## [1] 1 2 3 4 5 6 7 8 9 10
#> [1] 1 2 3 4 5 6 7 8 9 10 cumsum(x)
## [1] 1 3 6 10 15 21 28 36 45 55
#> [1] 1 3 6 10 15 21 28 36 45 55 cummean(x)
## [1] 1.000000 1.000000 1.333333 1.750000 2.200000 2.666667 3.142857 3.625000 ## [9] 4.111111 4.600000
#> [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
y <- c(1, 2, 2, NA, 3, 4) min_rank(y)
## [1] 1 2 2 NA 4 5
#> [1] 1 2 2 NA 4 5 min_rank(desc(y))
## [1] 5 3 3 NA 2 1
#> [1] 5 3 3 NA 2 1
row_number(y)
## [1] 1 2 3 NA 4 5
#> [1] 1 2 3 NA 4 5 dense_rank(y)
## [1] 1 2 2 NA 3 4
#> [1] 1 2 2 NA 3 4 percent_rank(y)
## [1] 0.00 0.25 0.25 NA 0.75 1.00
#> [1] 0.00 0.25 0.25 NA 0.75 1.00 cume_dist(y)
## [1] 0.2 0.6 0.6 NA 0.8 1.0
#> [1] 0.2 0.6 0.6 NA 0.8 1.0
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1 ## delay ## <dbl> ## 1 12.6
#> # A tibble: 1 x 1 #> delay #> <dbl> #> 1 12.6
by_day <- group_by(flights, year, month, day) summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4 ## # Groups: year, month [12] ## year month day delay ## <int> <int> <int> <dbl> ## 1 2013 1 1 11.5 ## 2 2013 1 2 13.9 ## 3 2013 1 3 11.0 ## 4 2013 1 4 8.95 ## 5 2013 1 5 5.73 ## 6 2013 1 6 7.15 ## 7 2013 1 7 5.42 ## 8 2013 1 8 2.55 ## 9 2013 1 9 2.28 ## 10 2013 1 10 2.84 ## # ... with 355 more rows
#> # A tibble: 365 x 4 #> # Groups: year, month [12] #> year month day delay #> <int> <int> <int> <dbl> #> 1 2013 1 1 11.5 #> 2 2013 1 2 13.9 #> 3 2013 1 3 11.0 #> 4 2013 1 4 8.95 #> 5 2013 1 5 5.73 #> 6 2013 1 6 7.15 #> # . with 359 more rows
5.6.1 Combining multiple operations with the pipe
by_dest <- group_by(flights, dest) delay <- summarise(by_dest, count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) )
delay <- filter(delay, count > 20, dest != "HNL") # It looks like delays increase with distance up to ~750 miles # and then decrease. Maybe as flights get longer there's more # ability to make up delays in the air? ggplot(data = delay, mapping = aes(x = dist, y = delay)) + geom_point(aes(size = count), alpha = 1/3) + geom_smooth(se = FALSE)

#> `geom_smooth()` using method = 'loess' and formula 'y ~ x'
delays <- flights %>% group_by(dest) %>% summarise( count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) %>% filter(count > 20, dest != "HNL")
5.6.2 Missing values
flights %>% group_by(year, month, day) %>% summarise(mean = mean(dep_delay))
## # A tibble: 365 x 4 ## # Groups: year, month [12] ## year month day mean ## <int> <int> <int> <dbl> ## 1 2013 1 1 NA ## 2 2013 1 2 NA ## 3 2013 1 3 NA ## 4 2013 1 4 NA ## 5 2013 1 5 NA ## 6 2013 1 6 NA ## 7 2013 1 7 NA ## 8 2013 1 8 NA ## 9 2013 1 9 NA ## 10 2013 1 10 NA ## # ... with 355 more rows
#> # A tibble: 365 x 4 #> # Groups: year, month [12] #> year month day mean #> <int> <int> <int> <dbl> #> 1 2013 1 1 NA #> 2 2013 1 2 NA #> 3 2013 1 3 NA #> 4 2013 1 4 NA #> 5 2013 1 5 NA #> 6 2013 1 6 NA #> # . with 359 more rows
flights %>% group_by(year, month, day) %>% summarise(mean = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4 ## # Groups: year, month [12] ## year month day mean ## <int> <int> <int> <dbl> ## 1 2013 1 1 11.5 ## 2 2013 1 2 13.9 ## 3 2013 1 3 11.0 ## 4 2013 1 4 8.95 ## 5 2013 1 5 5.73 ## 6 2013 1 6 7.15 ## 7 2013 1 7 5.42 ## 8 2013 1 8 2.55 ## 9 2013 1 9 2.28 ## 10 2013 1 10 2.84 ## # ... with 355 more rows
#> # A tibble: 365 x 4 #> # Groups: year, month [12] #> year month day mean #> <int> <int> <int> <dbl> #> 1 2013 1 1 11.5 #> 2 2013 1 2 13.9 #> 3 2013 1 3 11.0 #> 4 2013 1 4 8.95 #> 5 2013 1 5 5.73 #> 6 2013 1 6 7.15 #> # . with 359 more rows
not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay)) not_cancelled %>% group_by(year, month, day) %>% summarise(mean = mean(dep_delay))
## # A tibble: 365 x 4 ## # Groups: year, month [12] ## year month day mean ## <int> <int> <int> <dbl> ## 1 2013 1 1 11.4 ## 2 2013 1 2 13.7 ## 3 2013 1 3 10.9 ## 4 2013 1 4 8.97 ## 5 2013 1 5 5.73 ## 6 2013 1 6 7.15 ## 7 2013 1 7 5.42 ## 8 2013 1 8 2.56 ## 9 2013 1 9 2.30 ## 10 2013 1 10 2.84 ## # ... with 355 more rows
#> # A tibble: 365 x 4 #> # Groups: year, month [12] #> year month day mean #> <int> <int> <int> <dbl> #> 1 2013 1 1 11.4 #> 2 2013 1 2 13.7 #> 3 2013 1 3 10.9 #> 4 2013 1 4 8.97 #> 5 2013 1 5 5.73 #> 6 2013 1 6 7.15 #> # . with 359 more rows
5.6.3 Counts
delays <- not_cancelled %>% group_by(tailnum) %>% summarise( delay = mean(arr_delay) )
ggplot(data = delays, mapping = aes(x = delay)) + geom_freqpoly(binwidth = 10)

delays <- not_cancelled %>% group_by(tailnum) %>% summarise( delay = mean(arr_delay, na.rm = TRUE), n = n() )
ggplot(data = delays, mapping = aes(x = n, y = delay)) + geom_point(alpha = 1/10)

delays %>% filter(n > 25) %>% ggplot(mapping = aes(x = n, y = delay)) + geom_point(alpha = 1/10)

delays <- not_cancelled %>% group_by(tailnum) %>% summarise( delay = mean(arr_delay) )
ggplot(data = delays, mapping = aes(x = delay)) + geom_freqpoly(binwidth = 10)

delays <- not_cancelled %>% group_by(tailnum) %>% summarise( delay = mean(arr_delay, na.rm = TRUE), n = n() )
ggplot(data = delays, mapping = aes(x = n, y = delay)) + geom_point(alpha = 1/10)

delays %>% filter(n > 25) %>% ggplot(mapping = aes(x = n, y = delay)) + geom_point(alpha = 1/10)

# Convert to a tibble so it prints nicely batting <- as_tibble(Lahman::Batting)
## Error in loadNamespace(name): there is no package called 'Lahman'
batters <- batting %>% group_by(playerID) %>% summarise( ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE), ab = sum(AB, na.rm = TRUE) )
## Error in eval(lhs, parent, parent): object 'batting' not found
batters %>% filter(ab > 100) %>% ggplot(mapping = aes(x = ab, y = ba)) + geom_point() + geom_smooth(se = FALSE)
## Error in eval(lhs, parent, parent): object 'batters' not found
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
5.6.4 Useful summary functions
not_cancelled %>% group_by(year, month, day) %>% summarise( avg_delay1 = mean(arr_delay), avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay )
## # A tibble: 365 x 5 ## # Groups: year, month [12] ## year month day avg_delay1 avg_delay2 ## <int> <int> <int> <dbl> <dbl> ## 1 2013 1 1 12.7 32.5 ## 2 2013 1 2 12.7 32.0 ## 3 2013 1 3 5.73 27.7 ## 4 2013 1 4 -1.93 28.3 ## 5 2013 1 5 -1.53 22.6 ## 6 2013 1 6 4.24 24.4 ## 7 2013 1 7 -4.95 27.8 ## 8 2013 1 8 -3.23 20.8 ## 9 2013 1 9 -0.264 25.6 ## 10 2013 1 10 -5.90 27.3 ## # ... with 355 more rows
#> # A tibble: 365 x 5 #> # Groups: year, month [12] #> year month day avg_delay1 avg_delay2 #> <int> <int> <int> <dbl> <dbl> #> 1 2013 1 1 12.7 32.5 #> 2 2013 1 2 12.7 32.0 #> 3 2013 1 3 5.73 27.7 #> 4 2013 1 4 -1.93 28.3 #> 5 2013 1 5 -1.53 22.6 #> 6 2013 1 6 4.24 24.4 #> # . with 359 more rows
# Why is distance to some destinations more variable than to others? not_cancelled %>% group_by(dest) %>% summarise(distance_sd = sd(distance)) %>% arrange(desc(distance_sd))
## # A tibble: 104 x 2 ## dest distance_sd ## <chr> <dbl> ## 1 EGE 10.5 ## 2 SAN 10.4 ## 3 SFO 10.2 ## 4 HNL 10.0 ## 5 SEA 9.98 ## 6 LAS 9.91 ## 7 PDX 9.87 ## 8 PHX 9.86 ## 9 LAX 9.66 ## 10 IND 9.46 ## # ... with 94 more rows
#> # A tibble: 104 x 2 #> dest distance_sd #> <chr> <dbl> #> 1 EGE 10.5 #> 2 SAN 10.4 #> 3 SFO 10.2 #> 4 HNL 10.0 #> 5 SEA 9.98 #> 6 LAS 9.91 #> # . with 98 more rows
# When do the first and last flights leave each day? not_cancelled %>% group_by(year, month, day) %>% summarise( first = min(dep_time), last = max(dep_time) )
## # A tibble: 365 x 5 ## # Groups: year, month [12] ## year month day first last ## <int> <int> <int> <int> <int> ## 1 2013 1 1 517 2356 ## 2 2013 1 2 42 2354 ## 3 2013 1 3 32 2349 ## 4 2013 1 4 25 2358 ## 5 2013 1 5 14 2357 ## 6 2013 1 6 16 2355 ## 7 2013 1 7 49 2359 ## 8 2013 1 8 454 2351 ## 9 2013 1 9 2 2252 ## 10 2013 1 10 3 2320 ## # ... with 355 more rows
#> # A tibble: 365 x 5 #> # Groups: year, month [12] #> year month day first last #> <int> <int> <int> <int> <int> #> 1 2013 1 1 517 2356 #> 2 2013 1 2 42 2354 #> 3 2013 1 3 32 2349 #> 4 2013 1 4 25 2358 #> 5 2013 1 5 14 2357 #> 6 2013 1 6 16 2355 #> # . with 359 more rows
# When do the first and last flights leave each day? not_cancelled %>% group_by(year, month, day) %>% summarise( first = min(dep_time), last = max(dep_time) )
## # A tibble: 365 x 5 ## # Groups: year, month [12] ## year month day first last ## <int> <int> <int> <int> <int> ## 1 2013 1 1 517 2356 ## 2 2013 1 2 42 2354 ## 3 2013 1 3 32 2349 ## 4 2013 1 4 25 2358 ## 5 2013 1 5 14 2357 ## 6 2013 1 6 16 2355 ## 7 2013 1 7 49 2359 ## 8 2013 1 8 454 2351 ## 9 2013 1 9 2 2252 ## 10 2013 1 10 3 2320 ## # ... with 355 more rows
#> # A tibble: 365 x 5 #> # Groups: year, month [12] #> year month day first last #> <int> <int> <int> <int> <int> #> 1 2013 1 1 517 2356 #> 2 2013 1 2 42 2354 #> 3 2013 1 3 32 2349 #> 4 2013 1 4 25 2358 #> 5 2013 1 5 14 2357 #> 6 2013 1 6 16 2355 #> # . with 359 more rows
5.6.5 Grouping by multiple variables
daily <- group_by(flights, year, month, day) (per_day <- summarise(daily, flights = n()))
## # A tibble: 365 x 4 ## # Groups: year, month [12] ## year month day flights ## <int> <int> <int> <int> ## 1 2013 1 1 842 ## 2 2013 1 2 943 ## 3 2013 1 3 914 ## 4 2013 1 4 915 ## 5 2013 1 5 720 ## 6 2013 1 6 832 ## 7 2013 1 7 933 ## 8 2013 1 8 899 ## 9 2013 1 9 902 ## 10 2013 1 10 932 ## # ... with 355 more rows
#> # A tibble: 365 x 4 #> # Groups: year, month [12] #> year month day flights #> <int> <int> <int> <int> #> 1 2013 1 1 842 #> 2 2013 1 2 943 #> 3 2013 1 3 914 #> 4 2013 1 4 915 #> 5 2013 1 5 720 #> 6 2013 1 6 832 #> # . with 359 more rows (per_month <- summarise(per_day, flights = sum(flights)))
## # A tibble: 12 x 3 ## # Groups: year [1] ## year month flights ## <int> <int> <int> ## 1 2013 1 27004 ## 2 2013 2 24951 ## 3 2013 3 28834 ## 4 2013 4 28330 ## 5 2013 5 28796 ## 6 2013 6 28243 ## 7 2013 7 29425 ## 8 2013 8 29327 ## 9 2013 9 27574 ## 10 2013 10 28889 ## 11 2013 11 27268 ## 12 2013 12 28135
#> # A tibble: 12 x 3 #> # Groups: year [1] #> year month flights #> <int> <int> <int> #> 1 2013 1 27004 #> 2 2013 2 24951 #> 3 2013 3 28834 #> 4 2013 4 28330 #> 5 2013 5 28796 #> 6 2013 6 28243 #> # . with 6 more rows (per_year <- summarise(per_month, flights = sum(flights)))
## # A tibble: 1 x 2 ## year flights ## <int> <int> ## 1 2013 336776
#> # A tibble: 1 x 2 #> year flights #> <int> <int> #> 1 2013 336776
5.6.6 Ungrouping
daily %>% ungroup() %>% # no longer grouped by date summarise(flights = n()) # all flights
## # A tibble: 1 x 1 ## flights ## <int> ## 1 336776
#> # A tibble: 1 x 1 #> flights #> <int> #> 1 336776
5.7 Grouped mutates (and filters)
flights_sml %>% group_by(year, month, day) %>% filter(rank(desc(arr_delay)) < 10)
## # A tibble: 3,306 x 7 ## # Groups: year, month, day [365] ## year month day dep_delay arr_delay distance air_time ## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> ## 1 2013 1 1 853 851 184 41 ## 2 2013 1 1 290 338 1134 213 ## 3 2013 1 1 260 263 266 46 ## 4 2013 1 1 157 174 213 60 ## 5 2013 1 1 216 222 708 121 ## 6 2013 1 1 255 250 589 115 ## 7 2013 1 1 285 246 1085 146 ## 8 2013 1 1 192 191 199 44 ## 9 2013 1 1 379 456 1092 222 ## 10 2013 1 2 224 207 550 94 ## # ... with 3,296 more rows
#> # A tibble: 3,306 x 7 #> # Groups: year, month, day [365] #> year month day dep_delay arr_delay distance air_time #> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> #> 1 2013 1 1 853 851 184 41 #> 2 2013 1 1 290 338 1134 213 #> 3 2013 1 1 260 263 266 46 #> 4 2013 1 1 157 174 213 60 #> 5 2013 1 1 216 222 708 121 #> 6 2013 1 1 255 250 589 115 #> # . with 3,300 more rows
popular_dests <- flights %>% group_by(dest) %>% filter(n() > 365) popular_dests
## # A tibble: 332,577 x 19 ## # Groups: dest [77] ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 332,567 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 332,577 x 19 #> # Groups: dest [77] #> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time #> <int> <int> <int> <int> <int> <dbl> <int> <int> #> 1 2013 1 1 517 515 2 830 819 #> 2 2013 1 1 533 529 4 850 830 #> 3 2013 1 1 542 540 2 923 850 #> 4 2013 1 1 544 545 -1 1004 1022 #> 5 2013 1 1 554 600 -6 812 837 #> 6 2013 1 1 554 558 -4 740 728 #> # . with 332,571 more rows, and 11 more variables: arr_delay <dbl>, #> # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, #> # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
popular_dests %>% filter(arr_delay > 0) %>% mutate(prop_delay = arr_delay / sum(arr_delay)) %>% select(year:day, dest, arr_delay, prop_delay)
## # A tibble: 131,106 x 6 ## # Groups: dest [77] ## year month day dest arr_delay prop_delay ## <int> <int> <int> <chr> <dbl> <dbl> ## 1 2013 1 1 IAH 11 0.000111 ## 2 2013 1 1 IAH 20 0.000201 ## 3 2013 1 1 MIA 33 0.000235 ## 4 2013 1 1 ORD 12 0.0000424 ## 5 2013 1 1 FLL 19 0.0000938 ## 6 2013 1 1 ORD 8 0.0000283 ## 7 2013 1 1 LAX 7 0.0000344 ## 8 2013 1 1 DFW 31 0.000282 ## 9 2013 1 1 ATL 12 0.0000400 ## 10 2013 1 1 DTW 16 0.000116 ## # ... with 131,096 more rows
#> # A tibble: 131,106 x 6 #> # Groups: dest [77] #> year month day dest arr_delay prop_delay #> <int> <int> <int> <chr> <dbl> <dbl> #> 1 2013 1 1 IAH 11 0.000111 #> 2 2013 1 1 IAH 20 0.000201 #> 3 2013 1 1 MIA 33 0.000235 #> 4 2013 1 1 ORD 12 0.0000424 #> 5 2013 1 1 FLL 19 0.0000938 #> 6 2013 1 1 ORD 8 0.0000283 #> # . with 131,100 more rows
5.2.4 Exercises
1 - Find all flights that 1. Had an arrival delay of two or more hours
library(tidyverse) flights <- nycflights13::flights filter(flights, arr_delay >= 120)
## # A tibble: 10,200 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 811 630 101 1047 830 ## 2 2013 1 1 848 1835 853 1001 1950 ## 3 2013 1 1 957 733 144 1056 853 ## 4 2013 1 1 1114 900 134 1447 1222 ## 5 2013 1 1 1505 1310 115 1638 1431 ## 6 2013 1 1 1525 1340 105 1831 1626 ## 7 2013 1 1 1549 1445 64 1912 1656 ## 8 2013 1 1 1558 1359 119 1718 1515 ## 9 2013 1 1 1732 1630 62 2028 1825 ## 10 2013 1 1 1803 1620 103 2008 1750 ## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
2. Flew to Houston (IAH or HOU)
filter(flights, dest %in% c('IAH', 'HOU'))
## # A tibble: 9,313 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 623 627 -4 933 932 ## 4 2013 1 1 728 732 -4 1041 1038 ## 5 2013 1 1 739 739 0 1104 1038 ## 6 2013 1 1 908 908 0 1228 1219 ## 7 2013 1 1 1028 1026 2 1350 1339 ## 8 2013 1 1 1044 1045 -1 1352 1351 ## 9 2013 1 1 1114 900 134 1447 1222 ## 10 2013 1 1 1205 1200 5 1503 1505 ## # ... with 9,303 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
3. Were operated by United, American, or Delta
filter(flights, carrier %in% c('UA','AA','DL'))
## # A tibble: 139,504 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 554 600 -6 812 837 ## 5 2013 1 1 554 558 -4 740 728 ## 6 2013 1 1 558 600 -2 753 745 ## 7 2013 1 1 558 600 -2 924 917 ## 8 2013 1 1 558 600 -2 923 937 ## 9 2013 1 1 559 600 -1 941 910 ## 10 2013 1 1 559 600 -1 854 902 ## # ... with 139,494 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month %in% c(7,8,9))
filter(flights, month %in% c(7,8,9))
## # A tibble: 86,326 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 7 1 1 2029 212 236 2359 ## 2 2013 7 1 2 2359 3 344 344 ## 3 2013 7 1 29 2245 104 151 1 ## 4 2013 7 1 43 2130 193 322 14 ## 5 2013 7 1 44 2150 174 300 100 ## 6 2013 7 1 46 2051 235 304 2358 ## 7 2013 7 1 48 2001 287 308 2305 ## 8 2013 7 1 58 2155 183 335 43 ## 9 2013 7 1 100 2146 194 327 30 ## 10 2013 7 1 100 2245 135 337 135 ## # ... with 86,316 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
5. Arrived more than two hours late, but didn't leave late
filter(flights, dep_delay <= 0, arr_delay >= 120)
## # A tibble: 29 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 27 1419 1420 -1 1754 1550 ## 2 2013 10 7 1350 1350 0 1736 1526 ## 3 2013 10 7 1357 1359 -2 1858 1654 ## 4 2013 10 16 657 700 -3 1258 1056 ## 5 2013 11 1 658 700 -2 1329 1015 ## 6 2013 3 18 1844 1847 -3 39 2219 ## 7 2013 4 17 1635 1640 -5 2049 1845 ## 8 2013 4 18 558 600 -2 1149 850 ## 9 2013 4 18 655 700 -5 1213 950 ## 10 2013 5 22 1827 1830 -3 2217 2010 ## # ... with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>, ## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, ## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
6. Were delayed by at least an hour, but made up over 30 minutes in flight
filter(flights, dep_delay >= 60, dep_delay > arr_delay + 30)
## # A tibble: 1,844 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 2205 1720 285 46 2040 ## 2 2013 1 1 2326 2130 116 131 18 ## 3 2013 1 3 1503 1221 162 1803 1555 ## 4 2013 1 3 1839 1700 99 2056 1950 ## 5 2013 1 3 1850 1745 65 2148 2120 ## 6 2013 1 3 1941 1759 102 2246 2139 ## 7 2013 1 3 1950 1845 65 2228 2227 ## 8 2013 1 3 2015 1915 60 2135 2111 ## 9 2013 1 3 2257 2000 177 45 2224 ## 10 2013 1 4 1917 1700 137 2135 1950 ## # ... with 1,834 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
7. Departed between midnight and 6am (inclusive)
filter(flights, dep_time >= 0, dep_time <= 600)
## # A tibble: 9,344 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 9,334 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
2 - Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?
filter(flights, between(dep_time, 0 , 600))
## # A tibble: 9,344 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 517 515 2 830 819 ## 2 2013 1 1 533 529 4 850 830 ## 3 2013 1 1 542 540 2 923 850 ## 4 2013 1 1 544 545 -1 1004 1022 ## 5 2013 1 1 554 600 -6 812 837 ## 6 2013 1 1 554 558 -4 740 728 ## 7 2013 1 1 555 600 -5 913 854 ## 8 2013 1 1 557 600 -3 709 723 ## 9 2013 1 1 557 600 -3 838 846 ## 10 2013 1 1 558 600 -2 753 745 ## # ... with 9,334 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
3 - How many flights have a missing dep_time? What other variables are missing? What might these rows represent?
sum(is.na(flights$dep_time))
## [1] 8255
filter(flights, is.na(dep_time))
## # A tibble: 8,255 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 NA 1630 NA NA 1815 ## 2 2013 1 1 NA 1935 NA NA 2240 ## 3 2013 1 1 NA 1500 NA NA 1825 ## 4 2013 1 1 NA 600 NA NA 901 ## 5 2013 1 2 NA 1540 NA NA 1747 ## 6 2013 1 2 NA 1620 NA NA 1746 ## 7 2013 1 2 NA 1355 NA NA 1459 ## 8 2013 1 2 NA 1420 NA NA 1644 ## 9 2013 1 2 NA 1321 NA NA 1536 ## 10 2013 1 2 NA 1545 NA NA 1910 ## # ... with 8,245 more rows, and 11 more variables: arr_delay <dbl>, ## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, ## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
4 - Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA \* 0 is a tricky counterexample!)
NA * 0 gives NA
5.3.1 Exercises
1 -How could you use arrange() to sort all missing values to the start? (Hint: use is.na()). 2 -Sort flights to find the most delayed flights. Find the flights that left earliest. 3 - Sort flights to find the fastest (highest speed) flights. 4 - Which flights travelled the farthest? Which travelled the shortest?
head(arrange(flights, desc(is.na(dep_delay))))
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 NA 1630 NA NA 1815 ## 2 2013 1 1 NA 1935 NA NA 2240 ## 3 2013 1 1 NA 1500 NA NA 1825 ## 4 2013 1 1 NA 600 NA NA 901 ## 5 2013 1 2 NA 1540 NA NA 1747 ## 6 2013 1 2 NA 1620 NA NA 1746 ## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(arrange(flights, desc(dep_delay)))
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 9 641 900 1301 1242 1530 ## 2 2013 6 15 1432 1935 1137 1607 2120 ## 3 2013 1 10 1121 1635 1126 1239 1810 ## 4 2013 9 20 1139 1845 1014 1457 2210 ## 5 2013 7 22 845 1600 1005 1044 1815 ## 6 2013 4 10 1100 1900 960 1342 2211 ## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(arrange(flights, dep_delay))
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 12 7 2040 2123 -43 40 2352 ## 2 2013 2 3 2022 2055 -33 2240 2338 ## 3 2013 11 10 1408 1440 -32 1549 1559 ## 4 2013 1 11 1900 1930 -30 2233 2243 ## 5 2013 1 29 1703 1730 -27 1947 1957 ## 6 2013 8 9 729 755 -26 1002 955 ## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(arrange(flights, air_time))
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 16 1355 1315 40 1442 1411 ## 2 2013 4 13 537 527 10 622 628 ## 3 2013 12 6 922 851 31 1021 954 ## 4 2013 2 3 2153 2129 24 2247 2224 ## 5 2013 2 5 1303 1315 -12 1342 1411 ## 6 2013 2 12 2123 2130 -7 2211 2225 ## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(arrange(flights, desc(distance)))
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 1 857 900 -3 1516 1530 ## 2 2013 1 2 909 900 9 1525 1530 ## 3 2013 1 3 914 900 14 1504 1530 ## 4 2013 1 4 900 900 0 1516 1530 ## 5 2013 1 5 858 900 -2 1519 1530 ## 6 2013 1 6 1019 900 79 1558 1530 ## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(arrange(flights, distance))
## # A tibble: 6 x 19 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 7 27 NA 106 NA NA 245 ## 2 2013 1 3 2127 2129 -2 2222 2224 ## 3 2013 1 4 1240 1200 40 1333 1306 ## 4 2013 1 4 1829 1615 134 1937 1721 ## 5 2013 1 4 2128 2129 -1 2218 2224 ## 6 2013 1 5 1155 1200 -5 1241 1306 ## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>
5.4.1 Exercises 1 -Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights. 2 -What happens if you include the name of a variable multiple times in a select() call? 3 -What does the one_of() function do? Why might it be helpful in conjunction with this vector? vars <- c("year", "month", "day", "dep_delay", "arr_delay") 4- Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default? select(flights, contains("TIME"))
select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4 ## dep_time dep_delay arr_time arr_delay ## <int> <dbl> <int> <dbl> ## 1 517 2 830 11 ## 2 533 4 850 20 ## 3 542 2 923 33 ## 4 544 -1 1004 -18 ## 5 554 -6 812 -25 ## 6 554 -4 740 12 ## 7 555 -5 913 19 ## 8 557 -3 709 -14 ## 9 557 -3 838 -8 ## 10 558 -2 753 8 ## # ... with 336,766 more rows
select(flights, starts_with('dep'), starts_with('arr'))
## # A tibble: 336,776 x 4 ## dep_time dep_delay arr_time arr_delay ## <int> <dbl> <int> <dbl> ## 1 517 2 830 11 ## 2 533 4 850 20 ## 3 542 2 923 33 ## 4 544 -1 1004 -18 ## 5 554 -6 812 -25 ## 6 554 -4 740 12 ## 7 555 -5 913 19 ## 8 557 -3 709 -14 ## 9 557 -3 838 -8 ## 10 558 -2 753 8 ## # ... with 336,766 more rows
select(flights, dep_time, dep_time, dep_time)
## # A tibble: 336,776 x 1 ## dep_time ## <int> ## 1 517 ## 2 533 ## 3 542 ## 4 544 ## 5 554 ## 6 554 ## 7 555 ## 8 557 ## 9 557 ## 10 558 ## # ... with 336,766 more rows
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))
## # A tibble: 336,776 x 5 ## year month day dep_delay arr_delay ## <int> <int> <int> <dbl> <dbl> ## 1 2013 1 1 2 11 ## 2 2013 1 1 4 20 ## 3 2013 1 1 2 33 ## 4 2013 1 1 -1 -18 ## 5 2013 1 1 -6 -25 ## 6 2013 1 1 -4 12 ## 7 2013 1 1 -5 19 ## 8 2013 1 1 -3 -14 ## 9 2013 1 1 -3 -8 ## 10 2013 1 1 -2 8 ## # ... with 336,766 more rows
select(flights, contains("TIME"))
## # A tibble: 336,776 x 6 ## dep_time sched_dep_time arr_time sched_arr_time air_time time_hour ## <int> <int> <int> <int> <dbl> <dttm> ## 1 517 515 830 819 227 2013-01-01 05:00:00 ## 2 533 529 850 830 227 2013-01-01 05:00:00 ## 3 542 540 923 850 160 2013-01-01 05:00:00 ## 4 544 545 1004 1022 183 2013-01-01 05:00:00 ## 5 554 600 812 837 116 2013-01-01 06:00:00 ## 6 554 558 740 728 150 2013-01-01 05:00:00 ## 7 555 600 913 854 158 2013-01-01 06:00:00 ## 8 557 600 709 723 53 2013-01-01 06:00:00 ## 9 557 600 838 846 140 2013-01-01 06:00:00 ## 10 558 600 753 745 138 2013-01-01 06:00:00 ## # ... with 336,766 more rows
select(flights, contains("TIME", ignore.case = FALSE))
## # A tibble: 336,776 x 0
5.5.2 Exercises 1 -Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they're not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight. 2 -Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it? 3 - Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related? 4 -Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank(). 5 -What does 1:3 + 1:10 return? Why? 6 -What trigonometric functions does R provide?
flights <- mutate(flights, dep_time_mins = dep_time %/% 100 * 60 + dep_time %% 100, sched_dep_time_mins = sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) select(flights, starts_with('dep_time'), starts_with('sched'))
## # A tibble: 336,776 x 5 ## dep_time dep_time_mins sched_dep_time sched_arr_time sched_dep_time_mins ## <int> <dbl> <int> <int> <dbl> ## 1 517 317 515 819 315 ## 2 533 333 529 830 329 ## 3 542 342 540 850 340 ## 4 544 344 545 1022 345 ## 5 554 354 600 837 360 ## 6 554 354 558 728 358 ## 7 555 355 600 854 360 ## 8 557 357 600 723 360 ## 9 557 357 600 846 360 ## 10 558 358 600 745 360 ## # ... with 336,766 more rows
flights %>% mutate(flight_time = arr_time - dep_time) %>% select(air_time, flight_time)
## # A tibble: 336,776 x 2 ## air_time flight_time ## <dbl> <int> ## 1 227 313 ## 2 227 317 ## 3 160 381 ## 4 183 460 ## 5 116 258 ## 6 150 186 ## 7 158 358 ## 8 53 152 ## 9 140 281 ## 10 138 195 ## # ... with 336,766 more rows
flights <- mutate(flights, arr_time_mins = arr_time %/% 100 * 60 + arr_time %% 100) flights <- mutate(flights, flight_time = arr_time_mins - dep_time_mins) select(flights, air_time, flight_time)
## # A tibble: 336,776 x 2 ## air_time flight_time ## <dbl> <dbl> ## 1 227 193 ## 2 227 197 ## 3 160 221 ## 4 183 260 ## 5 116 138 ## 6 150 106 ## 7 158 198 ## 8 53 72 ## 9 140 161 ## 10 138 115 ## # ... with 336,766 more rows
sum(flights$air_time == flights$flight_time, na.rm = TRUE)
## [1] 196
select(flights, dep_time, sched_dep_time, dep_delay)
## # A tibble: 336,776 x 3 ## dep_time sched_dep_time dep_delay ## <int> <int> <dbl> ## 1 517 515 2 ## 2 533 529 4 ## 3 542 540 2 ## 4 544 545 -1 ## 5 554 600 -6 ## 6 554 558 -4 ## 7 555 600 -5 ## 8 557 600 -3 ## 9 557 600 -3 ## 10 558 600 -2 ## # ... with 336,766 more rows
head(arrange(flights, min_rank(desc(dep_delay))), 10)
## # A tibble: 10 x 23 ## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time ## <int> <int> <int> <int> <int> <dbl> <int> <int> ## 1 2013 1 9 641 900 1301 1242 1530 ## 2 2013 6 15 1432 1935 1137 1607 2120 ## 3 2013 1 10 1121 1635 1126 1239 1810 ## 4 2013 9 20 1139 1845 1014 1457 2210 ## 5 2013 7 22 845 1600 1005 1044 1815 ## 6 2013 4 10 1100 1900 960 1342 2211 ## 7 2013 3 17 2321 810 911 135 1020 ## 8 2013 6 27 959 1900 899 1236 2226 ## 9 2013 7 22 2257 759 898 121 1026 ## 10 2013 12 5 756 1700 896 1058 2020 ## # ... with 15 more variables: arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, ## # hour <dbl>, minute <dbl>, time_hour <dttm>, dep_time_mins <dbl>, ## # sched_dep_time_mins <dbl>, arr_time_mins <dbl>, flight_time <dbl>
## # A tibble: 10 x 23 1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object ## length
## [1] 2 4 6 5 7 9 8 10 12 11
?sin
5.6.7 Exercises 1 - Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios: A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time. A flight is always 10 minutes late. A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time. 99% of the time a flight is on time. 1% of the time it's 2 hours late. Which is more important: arrival delay or departure delay? 2 - Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()). 3 -Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay) ) is slightly suboptimal. Why? Which is the most important column? 4 - Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay? 5 - Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n())) 6 -What does the sort argument to count() do. When might you use it?
flights %>% filter(!is.na(dep_delay), !is.na(arr_delay)) %>% group_by(dest) %>% summarize(count = n())
## # A tibble: 104 x 2 ## dest count ## <chr> <int> ## 1 ABQ 254 ## 2 ACK 264 ## 3 ALB 418 ## 4 ANC 8 ## 5 ATL 16837 ## 6 AUS 2411 ## 7 AVL 261 ## 8 BDL 412 ## 9 BGR 358 ## 10 BHM 269 ## # ... with 94 more rows
flights %>% filter(!is.na(dep_delay), !is.na(arr_delay)) %>% group_by(tailnum) %>% summarize(count = sum(distance))
## # A tibble: 4,037 x 2 ## tailnum count ## <chr> <dbl> ## 1 D942DN 3418 ## 2 N0EGMQ 239143 ## 3 N10156 109664 ## 4 N102UW 25722 ## 5 N103US 24619 ## 6 N104UW 24616 ## 7 N10575 139903 ## 8 N105UW 23618 ## 9 N107US 21677 ## 10 N108UW 32070 ## # ... with 4,027 more rows
flights %>% select(starts_with("dep"), starts_with("arr")) %>% sapply(function(x){sum(is.na(x))})
## dep_time dep_delay dep_time_mins arr_time arr_delay ## 8255 8255 8255 8713 9430 ## arr_time_mins ## 8713
nrow(flights %>% filter(!is.na(dep_time), is.na(arr_time)))
## [1] 458
flights %>% group_by(month, day) %>% summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE), prop_cancelled = sum(is.na(dep_time)/n())) %>% ggplot(mapping = aes(x = avg_dep_delay, y = prop_cancelled)) + geom_point() + geom_smooth(method = 'lm', se = FALSE)

worst <- flights %>% group_by(carrier) %>% summarize(avg_arr_delay = mean(arr_delay, na.rm = TRUE), avg_dep_delay = mean(dep_delay, na.rm = TRUE))
arrange(worst, desc(avg_arr_delay))
## # A tibble: 16 x 3 ## carrier avg_arr_delay avg_dep_delay ## <chr> <dbl> <dbl> ## 1 F9 21.9 20.2 ## 2 FL 20.1 18.7 ## 3 EV 15.8 20.0 ## 4 YV 15.6 19.0 ## 5 OO 11.9 12.6 ## 6 MQ 10.8 10.6 ## 7 WN 9.65 17.7 ## 8 B6 9.46 13.0 ## 9 9E 7.38 16.7 ## 10 UA 3.56 12.1 ## 11 US 2.13 3.78 ## 12 VX 1.76 12.9 ## 13 DL 1.64 9.26 ## 14 AA 0.364 8.59 ## 15 HA -6.92 4.90 ## 16 AS -9.93 5.80
arrange(worst, desc(avg_dep_delay))
## # A tibble: 16 x 3 ## carrier avg_arr_delay avg_dep_delay ## <chr> <dbl> <dbl> ## 1 F9 21.9 20.2 ## 2 EV 15.8 20.0 ## 3 YV 15.6 19.0 ## 4 FL 20.1 18.7 ## 5 WN 9.65 17.7 ## 6 9E 7.38 16.7 ## 7 B6 9.46 13.0 ## 8 VX 1.76 12.9 ## 9 OO 11.9 12.6 ## 10 UA 3.56 12.1 ## 11 MQ 10.8 10.6 ## 12 DL 1.64 9.26 ## 13 AA 0.364 8.59 ## 14 AS -9.93 5.80 ## 15 HA -6.92 4.90 ## 16 US 2.13 3.78
flights %>% group_by(origin) %>% summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 3 x 2 ## origin avg_dep_delay ## <chr> <dbl> ## 1 EWR 15.1 ## 2 JFK 12.1 ## 3 LGA 10.3
flights %>% group_by(origin) %>% summarize(avg_arr_delay = mean(arr_delay, na.rm = TRUE))
## # A tibble: 3 x 2 ## origin avg_arr_delay ## <chr> <dbl> ## 1 EWR 9.11 ## 2 JFK 5.55 ## 3 LGA 5.78
flights %>% group_by(carrier, origin) %>% summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>% filter(carrier == '9E')
## # A tibble: 3 x 3 ## # Groups: carrier [1] ## carrier origin avg_dep_delay ## <chr> <chr> <dbl> ## 1 9E EWR 5.95 ## 2 9E JFK 19.0 ## 3 9E LGA 8.89
flights %>% filter(!is.na(dep_delay), !is.na(arr_delay)) %>% count(dest)
## # A tibble: 104 x 2 ## dest n ## <chr> <int> ## 1 ABQ 254 ## 2 ACK 264 ## 3 ALB 418 ## 4 ANC 8 ## 5 ATL 16837 ## 6 AUS 2411 ## 7 AVL 261 ## 8 BDL 412 ## 9 BGR 358 ## 10 BHM 269 ## # ... with 94 more rows
flights %>% filter(!is.na(dep_delay), !is.na(arr_delay)) %>% count(dest, sort = TRUE)
## # A tibble: 104 x 2 ## dest n ## <chr> <int> ## 1 ATL 16837 ## 2 ORD 16566 ## 3 LAX 16026 ## 4 BOS 15022 ## 5 MCO 13967 ## 6 CLT 13674 ## 7 SFO 13173 ## 8 FLL 11897 ## 9 MIA 11593 ## 10 DCA 9111 ## # ... with 94 more rows
5.7.1 Exercises 1-Refer back to the lists of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping. 2 - Which plane (tailnum) has the worst on-time record? 3 -What time of day should you fly if you want to avoid delays as much as possible? 4 - For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination. 5 -Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag(), explore how the delay of a flight is related to the delay of the immediately preceding flight. 6 - Look at each destination. Can you find flights that are suspiciously fast? (i.e. flights that represent a potential data entry error). Compute the air time of a flight relative to the shortest flight to that destination. Which flights were most delayed in the air? 7 -Find all destinations that are flown by at least two carriers. Use that information to rank the carriers. 8 - For each plane, count the number of flights before the first delay of greater than 1 hour.
grouping, functions like mean(), median(), min(), or max()
flights %>% group_by(tailnum) %>% filter(dep_delay > 0) %>% summarize(avg_delay = mean(dep_delay)) %>% arrange(desc(avg_delay))
## # A tibble: 3,885 x 2 ## tailnum avg_delay ## <chr> <dbl> ## 1 N844MH 297 ## 2 N452UW 291 ## 3 N922EV 274 ## 4 N587NW 272 ## 5 N911DA 268 ## 6 N665MQ 268. ## 7 N673MQ 257 ## 8 N851NW 233 ## 9 N654UA 227 ## 10 N550NW 212. ## # ... with 3,875 more rows
flights %>% filter (dep_delay > 0) %>% group_by(hour) %>% summarize(avg_delay = mean(dep_delay)) %>% arrange(avg_delay)
## # A tibble: 19 x 2 ## hour avg_delay ## <dbl> <dbl> ## 1 5 15.3 ## 2 7 24.1 ## 3 6 24.2 ## 4 9 29.7 ## 5 8 29.9 ## 6 12 32.3 ## 7 11 32.5 ## 8 10 32.6 ## 9 13 33.5 ## 10 14 37.1 ## 11 23 38.0 ## 12 15 38.8 ## 13 16 43.4 ## 14 17 45.3 ## 15 18 46.5 ## 16 22 46.5 ## 17 20 49.6 ## 18 21 50.3 ## 19 19 51.1
flights %>% select(dest, arr_delay) %>% group_by(dest) %>% filter(arr_delay > 0) %>% mutate(total_delay = sum(arr_delay, na.rm = TRUE), prop_delay = arr_delay / total_delay)
## # A tibble: 133,004 x 4 ## # Groups: dest [103] ## dest arr_delay total_delay prop_delay ## <chr> <dbl> <dbl> <dbl> ## 1 IAH 11 99391 0.000111 ## 2 IAH 20 99391 0.000201 ## 3 MIA 33 140424 0.000235 ## 4 ORD 12 283046 0.0000424 ## 5 FLL 19 202605 0.0000938 ## 6 ORD 8 283046 0.0000283 ## 7 LAX 7 203226 0.0000344 ## 8 DFW 31 110009 0.000282 ## 9 ATL 12 300299 0.0000400 ## 10 DTW 16 138258 0.000116 ## # ... with 132,994 more rows
flights %>% filter(origin == 'JFK') %>% filter(!is.na(dep_delay)) %>% mutate(pre_dep_delay = lag(dep_delay, default = 0)) %>% ggplot(mapping = aes(x = dep_delay, y= pre_dep_delay)) + geom_point(alpha = .5)

flights <- flights %>% filter(origin == 'JFK') %>% filter(!is.na(dep_delay)) %>% mutate(pre_dep_delay = lag(dep_delay, default = 0)) cor(flights$dep_delay, flights$pre_dep_delay)
## [1] 0.2928064
flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(air_time_mean = mean(air_time), air_time_sd = sd(air_time), z = (air_time - air_time_mean) / air_time_sd) %>% select(z, air_time_mean, dest, everything()) %>% arrange(z)
## # A tibble: 109,079 x 27 ## # Groups: dest [70] ## z air_time_mean dest year month day dep_time sched_dep_time dep_delay ## <dbl> <dbl> <chr> <int> <int> <int> <int> <int> <dbl> ## 1 -4.10 57.1 BUF 2013 11 10 2307 2250 17 ## 2 -3.80 51.9 ROC 2013 3 25 2340 2250 50 ## 3 -3.55 329. SEA 2013 7 3 1533 1459 34 ## 4 -3.51 52.6 ORF 2013 11 3 1720 1645 35 ## 5 -3.40 66.5 PIT 2013 7 17 1504 1505 -1 ## 6 -3.03 66.5 PIT 2013 5 8 1947 1859 48 ## 7 -3.03 329. SEA 2013 5 6 1728 1730 -2 ## 8 -2.99 331. PDX 2013 5 6 1753 1755 -2 ## 9 -2.98 329. LAX 2013 9 6 2042 2025 17 ## 10 -2.97 329. SEA 2013 5 6 1553 1557 -4 ## # ... with 109,069 more rows, and 18 more variables: arr_time <int>, ## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>, dep_time_mins <dbl>, ## # sched_dep_time_mins <dbl>, arr_time_mins <dbl>, flight_time <dbl>, ## # pre_dep_delay <dbl>, air_time_sd <dbl>
flights %>% filter(!is.na(air_time)) %>% group_by(dest) %>% mutate(air_time_mean = mean(air_time), air_time_sd = sd(air_time), z = (air_time - air_time_mean) / air_time_sd) %>% select(z, air_time_mean, dest, everything()) %>% arrange(desc(z))
## # A tibble: 109,079 x 27 ## # Groups: dest [70] ## z air_time_mean dest year month day dep_time sched_dep_time dep_delay ## <dbl> <dbl> <chr> <int> <int> <int> <int> <int> <dbl> ## 1 13.9 44.5 SYR 2013 9 1 2237 1711 326 ## 2 12.2 42.1 ACK 2013 6 29 755 800 -5 ## 3 11.8 38.5 BOS 2013 7 23 1617 1605 12 ## 4 10.8 38.5 BOS 2013 7 23 1200 1200 0 ## 5 10.4 71.9 RDU 2013 9 1 1719 1712 7 ## 6 10.3 57.1 BUF 2013 12 16 923 925 -2 ## 7 9.91 87.6 DTW 2013 7 27 1654 1620 34 ## 8 9.74 38.5 BOS 2013 2 17 841 840 1 ## 9 9.74 38.5 BOS 2013 7 23 1242 1245 -3 ## 10 9.69 51.9 ROC 2013 7 10 2350 2030 200 ## # ... with 109,069 more rows, and 18 more variables: arr_time <int>, ## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>, ## # tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, ## # minute <dbl>, time_hour <dttm>, dep_time_mins <dbl>, ## # sched_dep_time_mins <dbl>, arr_time_mins <dbl>, flight_time <dbl>, ## # pre_dep_delay <dbl>, air_time_sd <dbl>
flights %>% group_by(dest) %>% summarise(num_carrier = length(unique(carrier))) %>% filter(num_carrier >= 2) %>% arrange(desc(num_carrier))
## # A tibble: 45 x 2 ## dest num_carrier ## <chr> <int> ## 1 LAX 5 ## 2 SFO 5 ## 3 TPA 5 ## 4 AUS 4 ## 5 BOS 4 ## 6 LAS 4 ## 7 PIT 4 ## 8 ATL 3 ## 9 BNA 3 ## 10 CLT 3 ## # ... with 35 more rows
flights %>% filter(!is.na(dep_delay)) %>% group_by(tailnum) %>% mutate(max_delay = cummax(dep_delay), less_one_hour = max_delay < 60) %>% summarize(count = sum(less_one_hour)) %>% arrange(desc(count))
## # A tibble: 1,957 x 2 ## tailnum count ## <chr> <int> ## 1 N705TW 159 ## 2 N706TW 149 ## 3 N713TW 128 ## 4 N721TW 120 ## 5 N5FAAA 117 ## 6 N3769L 104 ## 7 N804JB 104 ## 8 N3748Y 103 ## 9 N645JB 94 ## 10 N3742C 91 ## # ... with 1,947 more rows