library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## nycflights13
nycflights13::flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
options(tibble.width=Inf)
filter(flights, dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 1 UA 496 N459UA LGA IAH 229 1416 6 27
## 4 3 UA 473 N488UA LGA IAH 238 1416 7 32
## 5 26 UA 1479 N37408 EWR IAH 249 1400 7 39
## 6 9 UA 1220 N12216 EWR IAH 233 1400 9 8
## 7 11 UA 1004 N76508 LGA IAH 237 1416 10 26
## 8 1 UA 455 N667UA EWR IAH 229 1400 10 45
## 9 145 UA 1086 N76502 LGA IAH 248 1416 9 0
## 10 -2 UA 1461 N39418 EWR IAH 221 1400 12 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 06:00:00
## 4 2013-01-01 07:00:00
## 5 2013-01-01 07:00:00
## 6 2013-01-01 09:00:00
## 7 2013-01-01 10:00:00
## 8 2013-01-01 10:00:00
## 9 2013-01-01 09:00:00
## 10 2013-01-01 12:00:00
## # ... with 9,303 more rows
filter(flights, carrier %in% c("UA","AA","DL"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 554 600 -6 812 837
## 5 2013 1 1 554 558 -4 740 728
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 600 -1 941 910
## 10 2013 1 1 559 600 -1 854 902
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 5 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 6 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## 7 7 UA 194 N29129 JFK LAX 345 2475 6 0
## 8 -14 UA 1124 N53441 EWR SFO 361 2565 6 0
## 9 31 AA 707 N3DUAA LGA DFW 257 1389 6 0
## 10 -8 UA 1187 N76515 EWR LAS 337 2227 6 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 06:00:00
## 5 2013-01-01 05:00:00
## 6 2013-01-01 06:00:00
## 7 2013-01-01 06:00:00
## 8 2013-01-01 06:00:00
## 9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # ... with 139,494 more rows
filter(flights, month %in% c(7,8,9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 1 1 2029 212 236 2359
## 2 2013 7 1 2 2359 3 344 344
## 3 2013 7 1 29 2245 104 151 1
## 4 2013 7 1 43 2130 193 322 14
## 5 2013 7 1 44 2150 174 300 100
## 6 2013 7 1 46 2051 235 304 2358
## 7 2013 7 1 48 2001 287 308 2305
## 8 2013 7 1 58 2155 183 335 43
## 9 2013 7 1 100 2146 194 327 30
## 10 2013 7 1 100 2245 135 337 135
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 157 B6 915 N653JB JFK SFO 315 2586 20 29
## 2 0 B6 1503 N805JB JFK SJU 200 1598 23 59
## 3 110 B6 234 N348JB JFK BTV 66 266 22 45
## 4 188 B6 1371 N794JB LGA FLL 143 1076 21 30
## 5 120 AA 185 N324AA JFK LAX 297 2475 21 50
## 6 186 B6 165 N640JB JFK PDX 304 2454 20 51
## 7 243 VX 415 N627VA JFK LAX 298 2475 20 1
## 8 172 B6 425 N535JB JFK TPA 140 1005 21 55
## 9 177 B6 1183 N531JB JFK MCO 126 944 21 46
## 10 122 B6 623 N663JB JFK LAX 304 2475 22 45
## time_hour
## <dttm>
## 1 2013-07-01 20:00:00
## 2 2013-07-01 23:00:00
## 3 2013-07-01 22:00:00
## 4 2013-07-01 21:00:00
## 5 2013-07-01 21:00:00
## 6 2013-07-01 20:00:00
## 7 2013-07-01 20:00:00
## 8 2013-07-01 21:00:00
## 9 2013-07-01 21:00:00
## 10 2013-07-01 22:00:00
## # ... with 86,316 more rows
filter(flights, arr_delay > 120 & dep_delay <= 0)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 27 1419 1420 -1 1754 1550
## 2 2013 10 7 1350 1350 0 1736 1526
## 3 2013 10 7 1357 1359 -2 1858 1654
## 4 2013 10 16 657 700 -3 1258 1056
## 5 2013 11 1 658 700 -2 1329 1015
## 6 2013 3 18 1844 1847 -3 39 2219
## 7 2013 4 17 1635 1640 -5 2049 1845
## 8 2013 4 18 558 600 -2 1149 850
## 9 2013 4 18 655 700 -5 1213 950
## 10 2013 5 22 1827 1830 -3 2217 2010
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 124 MQ 3728 N1EAMQ EWR ORD 135 719 14 20
## 2 130 EV 5181 N611QX LGA MSN 117 812 13 50
## 3 124 AA 1151 N3CMAA LGA DFW 192 1389 13 59
## 4 122 B6 3 N703JB JFK SJU 225 1598 7 0
## 5 194 VX 399 N629VA JFK LAX 336 2475 7 0
## 6 140 UA 389 N560UA JFK SFO 386 2586 18 47
## 7 124 MQ 4540 N721MQ LGA DTW 130 502 16 40
## 8 179 AA 707 N3EXAA LGA DFW 234 1389 6 0
## 9 143 AA 2083 N565AA EWR DFW 230 1372 7 0
## 10 127 MQ 4674 N518MQ LGA CLE 90 419 18 30
## time_hour
## <dttm>
## 1 2013-01-27 14:00:00
## 2 2013-10-07 13:00:00
## 3 2013-10-07 13:00:00
## 4 2013-10-16 07:00:00
## 5 2013-11-01 07:00:00
## 6 2013-03-18 18:00:00
## 7 2013-04-17 16:00:00
## 8 2013-04-18 06:00:00
## 9 2013-04-18 07:00:00
## 10 2013-05-22 18:00:00
## # ... with 19 more rows
filter(flights, dep_delay >= 60 & arr_delay < 30)
## # A tibble: 206 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 3 1850 1745 65 2148 2120
## 2 2013 1 3 1950 1845 65 2228 2227
## 3 2013 1 3 2015 1915 60 2135 2111
## 4 2013 1 6 1019 900 79 1558 1530
## 5 2013 1 7 1543 1430 73 1758 1735
## 6 2013 1 11 1020 920 60 1311 1245
## 7 2013 1 12 1706 1600 66 1949 1927
## 8 2013 1 12 1953 1845 68 2154 2137
## 9 2013 1 19 1456 1355 61 1636 1615
## 10 2013 1 21 1531 1430 61 1843 1815
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 28 AA 177 N332AA JFK SFO 314 2586 17 45
## 2 1 B6 91 N636JB JFK OAK 319 2576 18 45
## 3 24 9E 3525 N903XJ JFK ORD 110 740 19 15
## 4 28 HA 51 N385HA JFK HNL 611 4983 9 0
## 5 23 AA 883 N4XBAA EWR DFW 177 1372 14 30
## 6 26 AA 721 N434AA LGA DFW 201 1389 9 20
## 7 22 DL 2139 N982DL LGA MIA 141 1096 16 0
## 8 17 9E 3369 N919XJ JFK MSY 165 1182 18 45
## 9 21 EV 5469 N724EV LGA MEM 137 963 13 55
## 10 28 DL 2126 N710TW JFK SFO 345 2586 14 30
## time_hour
## <dttm>
## 1 2013-01-03 17:00:00
## 2 2013-01-03 18:00:00
## 3 2013-01-03 19:00:00
## 4 2013-01-06 09:00:00
## 5 2013-01-07 14:00:00
## 6 2013-01-11 09:00:00
## 7 2013-01-12 16:00:00
## 8 2013-01-12 18:00:00
## 9 2013-01-19 13:00:00
## 10 2013-01-21 14:00:00
## # ... with 196 more rows
filter(flights, is.na(dep_time))
## # A tibble: 8,255 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## 7 2013 1 2 NA 1355 NA NA 1459
## 8 2013 1 2 NA 1420 NA NA 1644
## 9 2013 1 2 NA 1321 NA NA 1536
## 10 2013 1 2 NA 1545 NA NA 1910
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 NA EV 4308 N18120 EWR RDU NA 416 16 30
## 2 NA AA 791 N3EHAA LGA DFW NA 1389 19 35
## 3 NA AA 1925 N3EVAA LGA MIA NA 1096 15 0
## 4 NA B6 125 N618JB JFK FLL NA 1069 6 0
## 5 NA EV 4352 N10575 EWR CVG NA 569 15 40
## 6 NA EV 4406 N13949 EWR PIT NA 319 16 20
## 7 NA EV 4434 N10575 EWR MHT NA 209 13 55
## 8 NA EV 4935 N759EV EWR ATL NA 746 14 20
## 9 NA EV 3849 N13550 EWR IND NA 645 13 21
## 10 NA AA 133 <NA> JFK LAX NA 2475 15 45
## time_hour
## <dttm>
## 1 2013-01-01 16:00:00
## 2 2013-01-01 19:00:00
## 3 2013-01-01 15:00:00
## 4 2013-01-01 06:00:00
## 5 2013-01-02 15:00:00
## 6 2013-01-02 16:00:00
## 7 2013-01-02 13:00:00
## 8 2013-01-02 14:00:00
## 9 2013-01-02 13:00:00
## 10 2013-01-02 15:00:00
## # ... with 8,245 more rows
Arrival time (arr_time) is also missing for these flights along with departure time (dep_time). This could represent that the flights were either canceled or rescheduled
arrange(flights, desc(is.na(dep_time)))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## 7 2013 1 2 NA 1355 NA NA 1459
## 8 2013 1 2 NA 1420 NA NA 1644
## 9 2013 1 2 NA 1321 NA NA 1536
## 10 2013 1 2 NA 1545 NA NA 1910
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 NA EV 4308 N18120 EWR RDU NA 416 16 30
## 2 NA AA 791 N3EHAA LGA DFW NA 1389 19 35
## 3 NA AA 1925 N3EVAA LGA MIA NA 1096 15 0
## 4 NA B6 125 N618JB JFK FLL NA 1069 6 0
## 5 NA EV 4352 N10575 EWR CVG NA 569 15 40
## 6 NA EV 4406 N13949 EWR PIT NA 319 16 20
## 7 NA EV 4434 N10575 EWR MHT NA 209 13 55
## 8 NA EV 4935 N759EV EWR ATL NA 746 14 20
## 9 NA EV 3849 N13550 EWR IND NA 645 13 21
## 10 NA AA 133 <NA> JFK LAX NA 2475 15 45
## time_hour
## <dttm>
## 1 2013-01-01 16:00:00
## 2 2013-01-01 19:00:00
## 3 2013-01-01 15:00:00
## 4 2013-01-01 06:00:00
## 5 2013-01-02 15:00:00
## 6 2013-01-02 16:00:00
## 7 2013-01-02 13:00:00
## 8 2013-01-02 14:00:00
## 9 2013-01-02 13:00:00
## 10 2013-01-02 15:00:00
## # ... with 336,766 more rows
arrange(flights, desc(is.na(arr_time)))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 2016 1930 46 NA 2220
## 2 2013 1 1 NA 1630 NA NA 1815
## 3 2013 1 1 NA 1935 NA NA 2240
## 4 2013 1 1 NA 1500 NA NA 1825
## 5 2013 1 1 NA 600 NA NA 901
## 6 2013 1 2 2041 2045 -4 NA 2359
## 7 2013 1 2 2145 2129 16 NA 33
## 8 2013 1 2 NA 1540 NA NA 1747
## 9 2013 1 2 NA 1620 NA NA 1746
## 10 2013 1 2 NA 1355 NA NA 1459
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 NA EV 4204 N14168 EWR OKC NA 1325 19 30
## 2 NA EV 4308 N18120 EWR RDU NA 416 16 30
## 3 NA AA 791 N3EHAA LGA DFW NA 1389 19 35
## 4 NA AA 1925 N3EVAA LGA MIA NA 1096 15 0
## 5 NA B6 125 N618JB JFK FLL NA 1069 6 0
## 6 NA B6 147 N630JB JFK RSW NA 1074 20 45
## 7 NA UA 1299 N12221 EWR RSW NA 1068 21 29
## 8 NA EV 4352 N10575 EWR CVG NA 569 15 40
## 9 NA EV 4406 N13949 EWR PIT NA 319 16 20
## 10 NA EV 4434 N10575 EWR MHT NA 209 13 55
## time_hour
## <dttm>
## 1 2013-01-01 19:00:00
## 2 2013-01-01 16:00:00
## 3 2013-01-01 19:00:00
## 4 2013-01-01 15:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-02 20:00:00
## 7 2013-01-02 21:00:00
## 8 2013-01-02 15:00:00
## 9 2013-01-02 16:00:00
## 10 2013-01-02 13:00:00
## # ... with 336,766 more rows
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1272 HA 51 N384HA JFK HNL 640 4983 9 0
## 2 1127 MQ 3535 N504MQ JFK CMH 74 483 19 35
## 3 1109 MQ 3695 N517MQ EWR ORD 111 719 16 35
## 4 1007 AA 177 N338AA JFK SFO 354 2586 18 45
## 5 989 MQ 3075 N665MQ JFK CVG 96 589 16 0
## 6 931 DL 2391 N959DL JFK TPA 139 1005 19 0
## 7 915 DL 2119 N927DA LGA MSP 167 1020 8 10
## 8 850 DL 2007 N3762Y JFK PDX 313 2454 19 0
## 9 895 DL 2047 N6716C LGA ATL 109 762 7 59
## 10 878 AA 172 N5DMAA EWR MIA 149 1085 17 0
## time_hour
## <dttm>
## 1 2013-01-09 09:00:00
## 2 2013-06-15 19:00:00
## 3 2013-01-10 16:00:00
## 4 2013-09-20 18:00:00
## 5 2013-07-22 16:00:00
## 6 2013-04-10 19:00:00
## 7 2013-03-17 08:00:00
## 8 2013-06-27 19:00:00
## 9 2013-07-22 07:00:00
## 10 2013-12-05 17:00:00
## # ... with 336,766 more rows
arrange(flights, dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 12 7 2040 2123 -43 40 2352
## 2 2013 2 3 2022 2055 -33 2240 2338
## 3 2013 11 10 1408 1440 -32 1549 1559
## 4 2013 1 11 1900 1930 -30 2233 2243
## 5 2013 1 29 1703 1730 -27 1947 1957
## 6 2013 8 9 729 755 -26 1002 955
## 7 2013 10 23 1907 1932 -25 2143 2143
## 8 2013 3 30 2030 2055 -25 2213 2250
## 9 2013 3 2 1431 1455 -24 1601 1631
## 10 2013 5 5 934 958 -24 1225 1309
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 48 B6 97 N592JB JFK DEN 265 1626 21 23
## 2 -58 DL 1715 N612DL LGA MSY 162 1183 20 55
## 3 -10 EV 5713 N825AS LGA IAD 52 229 14 40
## 4 -10 DL 1435 N934DL LGA TPA 139 1010 19 30
## 5 -10 F9 837 N208FR LGA DEN 250 1620 17 30
## 6 7 MQ 3478 N711MQ LGA DTW 88 502 7 55
## 7 0 EV 4361 N13994 EWR TYS 111 631 19 32
## 8 -37 MQ 4573 N725MQ LGA DTW 87 502 20 55
## 9 -30 9E 3318 N929XJ JFK BUF 55 301 14 55
## 10 -44 B6 375 N531JB LGA FLL 150 1076 9 58
## time_hour
## <dttm>
## 1 2013-12-07 21:00:00
## 2 2013-02-03 20:00:00
## 3 2013-11-10 14:00:00
## 4 2013-01-11 19:00:00
## 5 2013-01-29 17:00:00
## 6 2013-08-09 07:00:00
## 7 2013-10-23 19:00:00
## 8 2013-03-30 20:00:00
## 9 2013-03-02 14:00:00
## 10 2013-05-05 09:00:00
## # ... with 336,766 more rows
Flight that left earliest was B6 carrier flight 97 and HA carrier flight 51 was the most delayed.
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, any_of(vars))
## # A tibble: 336,776 x 5
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2 11
## 2 2013 1 1 4 20
## 3 2013 1 1 2 33
## 4 2013 1 1 -1 -18
## 5 2013 1 1 -6 -25
## 6 2013 1 1 -4 12
## 7 2013 1 1 -5 19
## 8 2013 1 1 -3 -14
## 9 2013 1 1 -3 -8
## 10 2013 1 1 -2 8
## # ... with 336,766 more rows
any_of() and all_of() functions have replaced one_of function in R. all_of() will throw an error if even one of the variables in missing, but any_of() will ignore any missing variables present in the data frame.
flightsairtime <- mutate(flights,
dep_time = (dep_time %/% 100*60 + dep_time %% 100) %% 1440,
arr_time = (arr_time %/% 100*60 + arr_time %% 100) %% 1440,
)
nrow(filter(flightsairtime, air_time != arr_time - dep_time ))
## [1] 327150
I expected to see that air_time = arr_time - dep_time. However, I see that there are 327150 rows where airtime != arr_time - dep_time. Time zone differences could be causing this difference. Also, another reason could be flights crossing over midnight to go to the destination and that means it is next day.
delayed_flights <- mutate(flights, minrank_dep_delay = min_rank(desc(dep_delay)))
delayed_flights <- arrange(delayed_flights, minrank_dep_delay)
head(delayed_flights, 10)
## # A tibble: 10 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1272 HA 51 N384HA JFK HNL 640 4983 9 0
## 2 1127 MQ 3535 N504MQ JFK CMH 74 483 19 35
## 3 1109 MQ 3695 N517MQ EWR ORD 111 719 16 35
## 4 1007 AA 177 N338AA JFK SFO 354 2586 18 45
## 5 989 MQ 3075 N665MQ JFK CVG 96 589 16 0
## 6 931 DL 2391 N959DL JFK TPA 139 1005 19 0
## 7 915 DL 2119 N927DA LGA MSP 167 1020 8 10
## 8 850 DL 2007 N3762Y JFK PDX 313 2454 19 0
## 9 895 DL 2047 N6716C LGA ATL 109 762 7 59
## 10 878 AA 172 N5DMAA EWR MIA 149 1085 17 0
## time_hour minrank_dep_delay
## <dttm> <int>
## 1 2013-01-09 09:00:00 1
## 2 2013-06-15 19:00:00 2
## 3 2013-01-10 16:00:00 3
## 4 2013-09-20 18:00:00 4
## 5 2013-07-22 16:00:00 5
## 6 2013-04-10 19:00:00 6
## 7 2013-03-17 08:00:00 7
## 8 2013-06-27 19:00:00 8
## 9 2013-07-22 07:00:00 9
## 10 2013-12-05 17:00:00 10
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
## [1] 2 4 6 5 7 9 8 10 12 11
flights %>%
group_by(carrier) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(arr_delay))
## # A tibble: 16 x 2
## carrier arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
flights %>%
count(carrier, sort = TRUE)
## # A tibble: 16 x 2
## carrier n
## <chr> <int>
## 1 UA 58665
## 2 B6 54635
## 3 EV 54173
## 4 DL 48110
## 5 AA 32729
## 6 MQ 26397
## 7 US 20536
## 8 9E 18460
## 9 WN 12275
## 10 VX 5162
## 11 FL 3260
## 12 AS 714
## 13 F9 685
## 14 YV 601
## 15 HA 342
## 16 OO 32
Sort argument to count() sorts the output, that is, the number of occurrences from highest to lowest. Sort argument in this code is sorting the carriers by the number of times it appears in the dataframe. It is useful when we want to identify which row is appearing often in the dataframe.
flights %>%
group_by(tailnum) %>%
filter(dep_delay > 0) %>%
summarise(avg_delay = mean(dep_delay)) %>%
arrange(desc(avg_delay))
## # A tibble: 3,885 x 2
## tailnum avg_delay
## <chr> <dbl>
## 1 N844MH 297
## 2 N452UW 291
## 3 N922EV 274
## 4 N587NW 272
## 5 N911DA 268
## 6 N665MQ 268.
## 7 N673MQ 257
## 8 N851NW 233
## 9 N654UA 227
## 10 N550NW 212.
## # ... with 3,875 more rows
Based on the above results, N844MH has worst on-time record.
flights %>%
filter(arr_delay > 0) %>%
group_by(dest) %>%
mutate(sum_arr_delay = sum(arr_delay),
proportion_arr_delay = arr_delay / sum_arr_delay
) %>%
select(dest, month, day, dep_time, carrier, flight,
arr_delay, proportion_arr_delay) %>%
arrange(dest, desc(proportion_arr_delay))
## # A tibble: 133,004 x 8
## # Groups: dest [103]
## dest month day dep_time carrier flight arr_delay proportion_arr_delay
## <chr> <int> <int> <int> <chr> <int> <dbl> <dbl>
## 1 ABQ 7 22 2145 B6 1505 153 0.0341
## 2 ABQ 12 14 2223 B6 65 149 0.0332
## 3 ABQ 10 15 2146 B6 65 138 0.0308
## 4 ABQ 7 23 2206 B6 1505 137 0.0305
## 5 ABQ 12 17 2220 B6 65 136 0.0303
## 6 ABQ 7 10 2025 B6 1505 126 0.0281
## 7 ABQ 7 30 2212 B6 1505 118 0.0263
## 8 ABQ 7 28 2038 B6 1505 117 0.0261
## 9 ABQ 12 8 2049 B6 65 114 0.0254
## 10 ABQ 9 2 2212 B6 1505 109 0.0243
## # ... with 132,994 more rows
flights %>%
group_by(dest) %>%
mutate(num_carriers = n_distinct(carrier)) %>%
filter(num_carriers >= 2) %>%
group_by(carrier) %>%
summarise(num_dest = n_distinct(dest)) %>%
arrange(desc(num_dest))
## # A tibble: 16 x 2
## carrier num_dest
## <chr> <int>
## 1 EV 51
## 2 9E 48
## 3 UA 42
## 4 DL 39
## 5 B6 35
## 6 AA 19
## 7 MQ 19
## 8 WN 10
## 9 OO 5
## 10 US 5
## 11 VX 4
## 12 YV 3
## 13 FL 2
## 14 AS 1
## 15 F9 1
## 16 HA 1