#To clear environment and re-run from the start...
#rm(list = ls())
#install.packages('nycflights13')
library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Note: you will probablly have to install nycflights13 using install.packages and the load it with the library command. nycflights13 is a relational database containig the following tables (data frames). This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley
| data frame | description |
|---|---|
| airlines | Airline names |
| airports | Airport metadata |
| flights | Flights data |
| planes | Planes meta data |
| weather | Hourly data |
This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley. You should start reading this chapter now, and try to complete reading it by the end of this week.
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#******************************************************************************#
nycflights13 if necessary.Write up your solutions to the exercises in 5.2.4 in this document, including the code chunks you use to determine the answer.
1-1. Had arrival delay of 2 or more hours
filter(flights, arr_delay >= 120) -> long_delay
long_delay <- tibble(long_delay)
long_delay
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
ld_count <- count(long_delay)
ld_count
## # A tibble: 1 x 1
## n
## <int>
## 1 10200
1-2. Flew to Houston
filter(flights, (dest == 'IAH' | dest == 'HOU')) -> to_houston
to_houston <- tibble(to_houston)
to_houston
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # ... with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
hou_count <- count(to_houston)
hou_count
## # A tibble: 1 x 1
## n
## <int>
## 1 9313
1-3. Carrier was United, American, or Delta
filter(flights, (carrier == 'DL' | carrier == 'AA' | carrier == 'UA')) -> car_type
car_type <- tibble(car_type)
car_type
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 554 600 -6 812 837
## 5 2013 1 1 554 558 -4 740 728
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 600 -1 941 910
## 10 2013 1 1 559 600 -1 854 902
## # ... with 139,494 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
type_count <- count(car_type)
type_count
## # A tibble: 1 x 1
## n
## <int>
## 1 139504
1-4. Summer departure (July, Aug, Sep)
filter(flights, (month == 7 | month == 8 | month == 9)) -> sum_dpt
sum_dpt <- tibble(sum_dpt)
sum_dpt
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 1 1 2029 212 236 2359
## 2 2013 7 1 2 2359 3 344 344
## 3 2013 7 1 29 2245 104 151 1
## 4 2013 7 1 43 2130 193 322 14
## 5 2013 7 1 44 2150 174 300 100
## 6 2013 7 1 46 2051 235 304 2358
## 7 2013 7 1 48 2001 287 308 2305
## 8 2013 7 1 58 2155 183 335 43
## 9 2013 7 1 100 2146 194 327 30
## 10 2013 7 1 100 2245 135 337 135
## # ... with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
sum_count <- count(sum_dpt)
sum_count
## # A tibble: 1 x 1
## n
## <int>
## 1 86326
1-5. Arrived more than 2 hours late but did not leave late
filter(flights, (arr_delay >= 120 & dep_delay <= 0)) -> late_no_delay
late_no_delay <- tibble(late_no_delay)
late_no_delay
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 27 1419 1420 -1 1754 1550
## 2 2013 10 7 1350 1350 0 1736 1526
## 3 2013 10 7 1357 1359 -2 1858 1654
## 4 2013 10 16 657 700 -3 1258 1056
## 5 2013 11 1 658 700 -2 1329 1015
## 6 2013 3 18 1844 1847 -3 39 2219
## 7 2013 4 17 1635 1640 -5 2049 1845
## 8 2013 4 18 558 600 -2 1149 850
## 9 2013 4 18 655 700 -5 1213 950
## 10 2013 5 22 1827 1830 -3 2217 2010
## # ... with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
late_nd_count <- count(late_no_delay)
late_nd_count
## # A tibble: 1 x 1
## n
## <int>
## 1 29
1-6. Delayed at least 1 hour but made up at least 30 min in flight
filter(flights, (dep_delay >= 60 & (arr_delay - dep_delay) <= -30)) -> late_made30
late_made30 <- tibble(late_made30)
late_made30
## # A tibble: 2,074 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1716 1545 91 2140 2039
## 2 2013 1 1 2205 1720 285 46 2040
## 3 2013 1 1 2326 2130 116 131 18
## 4 2013 1 3 1503 1221 162 1803 1555
## 5 2013 1 3 1821 1530 171 2131 1910
## 6 2013 1 3 1839 1700 99 2056 1950
## 7 2013 1 3 1850 1745 65 2148 2120
## 8 2013 1 3 1923 1815 68 2036 1958
## 9 2013 1 3 1941 1759 102 2246 2139
## 10 2013 1 3 1950 1845 65 2228 2227
## # ... with 2,064 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
late_made30_count <- count(late_made30)
late_made30_count
## # A tibble: 1 x 1
## n
## <int>
## 1 2074
1-7. Departed between midnight and 6 a.m.
filter(flights, (dep_time == 2400 | dep_time <= 600)) -> late_no_delay #dataset considers midnight 2400 rather than 0000
late_no_delay <- tibble(late_no_delay)
late_no_delay
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
late_nd_count <- count(late_no_delay)
late_nd_count
## # A tibble: 1 x 1
## n
## <int>
## 1 9373
#between takes a starting value and an ending value and returns all the values greater than the starting value
#and smaller than the ending value (inclusive)
#between() could be used for question 1-4, to get the months between(month, 7, 9)
#IF 0000 was midnight instead of 2400, it also could have been used there between(dep_time, 0, 600)
filter(flights, is.na(dep_time)) -> dep_missing
dep_missing
## # A tibble: 8,255 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## 7 2013 1 2 NA 1355 NA NA 1459
## 8 2013 1 2 NA 1420 NA NA 1644
## 9 2013 1 2 NA 1321 NA NA 1536
## 10 2013 1 2 NA 1545 NA NA 1910
## # ... with 8,245 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
count(dep_missing)
## # A tibble: 1 x 1
## n
## <int>
## 1 8255
#since these flights also have missing arrive time information etc., it likely indicates canceled fligths
#NA^0 -- any value to the 0 is always 1
#NA | TRUE -- or returns true as long as one is true. TRUE will always be TRUE
#FALSE & NA -- and needs both to be true, otherwise it is false. A value cannot be both missing and false
#so this statement will always be FALSE
#NA * 0 -- is attempting to multiply nothing by 0, this still leave nothing there.
Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.
arrange(flights, desc(is.na(flights$year)), desc(is.na(flights$month)), desc(is.na(flights$day)), desc(is.na(flights$dep_time)), desc(is.na(flights$sched_dep_time)), desc(is.na(flights$dep_delay)), desc(is.na(flights$arr_time)), desc(is.na(flights$sched_arr_time)), desc(is.na(flights$arr_delay)), desc(is.na(flights$carrier)), desc(is.na(flights$flight)), desc(is.na(flights$air_time)), desc(is.na(flights$distance)), desc(is.na(flights$hour)), desc(is.na(flights$minute)), desc(is.na(flights$time_hour))) -> no_data
no_data
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## 7 2013 1 2 NA 1355 NA NA 1459
## 8 2013 1 2 NA 1420 NA NA 1644
## 9 2013 1 2 NA 1321 NA NA 1536
## 10 2013 1 2 NA 1545 NA NA 1910
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
late <- arrange(flights, desc(dep_delay))
late
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
early <- arrange(flights, dep_delay)
early
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 12 7 2040 2123 -43 40 2352
## 2 2013 2 3 2022 2055 -33 2240 2338
## 3 2013 11 10 1408 1440 -32 1549 1559
## 4 2013 1 11 1900 1930 -30 2233 2243
## 5 2013 1 29 1703 1730 -27 1947 1957
## 6 2013 8 9 729 755 -26 1002 955
## 7 2013 10 23 1907 1932 -25 2143 2143
## 8 2013 3 30 2030 2055 -25 2213 2250
## 9 2013 3 2 1431 1455 -24 1601 1631
## 10 2013 5 5 934 958 -24 1225 1309
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
fastest <- arrange(flights, desc(distance / (air_time/60)))
fastest
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 5 25 1709 1700 9 1923 1937
## 2 2013 7 2 1558 1513 45 1745 1719
## 3 2013 5 13 2040 2025 15 2225 2226
## 4 2013 3 23 1914 1910 4 2045 2043
## 5 2013 1 12 1559 1600 -1 1849 1917
## 6 2013 11 17 650 655 -5 1059 1150
## 7 2013 2 21 2355 2358 -3 412 438
## 8 2013 11 17 759 800 -1 1212 1255
## 9 2013 11 16 2003 1925 38 17 36
## 10 2013 11 16 2349 2359 -10 402 440
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
farthest <- arrange(flights, desc(distance))
farthest
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 857 900 -3 1516 1530
## 2 2013 1 2 909 900 9 1525 1530
## 3 2013 1 3 914 900 14 1504 1530
## 4 2013 1 4 900 900 0 1516 1530
## 5 2013 1 5 858 900 -2 1519 1530
## 6 2013 1 6 1019 900 79 1558 1530
## 7 2013 1 7 1042 900 102 1620 1530
## 8 2013 1 8 901 900 1 1504 1530
## 9 2013 1 9 641 900 1301 1242 1530
## 10 2013 1 10 859 900 -1 1449 1530
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
shortest <- arrange(flights, distance)
shortest
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 27 NA 106 NA NA 245
## 2 2013 1 3 2127 2129 -2 2222 2224
## 3 2013 1 4 1240 1200 40 1333 1306
## 4 2013 1 4 1829 1615 134 1937 1721
## 5 2013 1 4 2128 2129 -1 2218 2224
## 6 2013 1 5 1155 1200 -5 1241 1306
## 7 2013 1 6 2125 2129 -4 2224 2224
## 8 2013 1 7 2124 2129 -5 2212 2224
## 9 2013 1 8 2127 2130 -3 2304 2225
## 10 2013 1 9 2126 2129 -3 2217 2224
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.
select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, dep_time:arr_delay, -c(sched_dep_time, sched_arr_time))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, -(year:day), -c(sched_dep_time, sched_arr_time), -(carrier:time_hour))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
#It appears that the duplicated is ignored
select(flights, dep_time, dep_time, arr_time)
## # A tibble: 336,776 x 2
## dep_time arr_time
## <int> <int>
## 1 517 830
## 2 533 850
## 3 542 923
## 4 544 1004
## 5 554 812
## 6 554 740
## 7 555 913
## 8 557 709
## 9 557 838
## 10 558 753
## # ... with 336,766 more rows
vars <- c(“year”, “month”, “day”, “dep_delay”, “arr_delay”)
#It selects any of the columns found in the vector and ignores the values of the vector that are not found in the data set.
#If one of the values in the vector, 'vars', was not a column name using any_of would stop an error from being thrown.
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, any_of(vars))
## # A tibble: 336,776 x 5
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2 11
## 2 2013 1 1 4 20
## 3 2013 1 1 2 33
## 4 2013 1 1 -1 -18
## 5 2013 1 1 -6 -25
## 6 2013 1 1 -4 12
## 7 2013 1 1 -5 19
## 8 2013 1 1 -3 -14
## 9 2013 1 1 -3 -8
## 10 2013 1 1 -2 8
## # ... with 336,766 more rows
select(flights, contains(“TIME”))
#It appeas that, by default, the helpers turn everything into lowercase and look for matches.
#To change this default, one can use the following argument:
#contains(match, ignore.case = FALSE)
#Source:
#https://www.r-bloggers.com/2015/07/the-complete-catalog-of-argument-variations-of-select-in-dplyr/
select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
## dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
## <int> <int> <int> <int> <dbl> <dttm>
## 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 533 529 850 830 227 2013-01-01 05:00:00
## 3 542 540 923 850 160 2013-01-01 05:00:00
## 4 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 554 600 812 837 116 2013-01-01 06:00:00
## 6 554 558 740 728 150 2013-01-01 05:00:00
## 7 555 600 913 854 158 2013-01-01 06:00:00
## 8 557 600 709 723 53 2013-01-01 06:00:00
## 9 557 600 838 846 140 2013-01-01 06:00:00
## 10 558 600 753 745 138 2013-01-01 06:00:00
## # ... with 336,766 more rows
select(flights, contains("TIME", ignore.case = FALSE))
## # A tibble: 336,776 x 0
#To clear and run again...
#rm(flights_updated)
Write up your solutions to the exercises in 5.5.2 in this document, including the code chunks you use to determine the answer.
#convert dep_time and arr_time to minutes since midnight
mutate(flights, dep_mins = (dep_time %/% 100 * 60) + (dep_time %% 100)) %>%
mutate(flights, arr_mins = (arr_time %/% 100 * 60) + (arr_time %% 100)) -> flights_updated
#Replace any midnight departure or arrivals (min value of 1440) with 0 (mins since midnight)
replace(flights_updated$dep_mins, flights_updated$dep_mins == 1440, 0) -> flights_updated$dep_mins
replace(flights_updated$arr_mins, flights_updated$arr_mins == 1440, 0) -> flights_updated$arr_mins
#results
#flights #original data set
flights_updated #updated data set shows new columns
## # A tibble: 336,776 x 21
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 13 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # dep_mins <dbl>, arr_mins <dbl>
filter(flights_updated, dep_mins == 0) #check that all 2400 changed to 0
## # A tibble: 29 x 21
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 30 2400 2359 1 327 337
## 2 2013 11 27 2400 2359 1 515 445
## 3 2013 12 5 2400 2359 1 427 440
## 4 2013 12 9 2400 2359 1 432 440
## 5 2013 12 9 2400 2250 70 59 2356
## 6 2013 12 13 2400 2359 1 432 440
## 7 2013 12 19 2400 2359 1 434 440
## 8 2013 12 29 2400 1700 420 302 2025
## 9 2013 2 7 2400 2359 1 432 436
## 10 2013 2 7 2400 2359 1 443 444
## # ... with 19 more rows, and 13 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, dep_mins <dbl>,
## # arr_mins <dbl>
filter(flights_updated, arr_mins == 0) #check that all 2400 changed to 0
## # A tibble: 150 x 21
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 2209 2155 14 2400 2337
## 2 2013 1 5 2116 2130 -14 2400 18
## 3 2013 1 13 2243 2129 74 2400 2224
## 4 2013 1 16 2138 2107 31 2400 2322
## 5 2013 1 17 2256 2249 7 2400 2357
## 6 2013 1 22 2212 2055 77 2400 2250
## 7 2013 1 22 2249 2125 84 2400 2250
## 8 2013 1 25 2055 1725 210 2400 1933
## 9 2013 1 28 2303 2250 13 2400 2354
## 10 2013 1 30 2155 1915 160 2400 2137
## # ... with 140 more rows, and 13 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # dep_mins <dbl>, arr_mins <dbl>
#flights
#I expect this to equal TRUE
#print((flights$arr_time - flights$dep_time) - flights$air_time == 0)
#but in fact, all are FALSE
#This is because arr_time and dep_time are in HHMM, not pure minutes
#must use converted dep_mins and arr_mins)
#print((flights_updated$arr_mins - flights_updated$dep_mins) - flights_updated$air_time == 0)
#This still returns all FALSE
#Upon examining the info on the data set, air time does not include time on the runway etc.
#To get the desired info, I must calculate a new column, which is total time of flight
#abs(arr_min - dep_mins) -- abs() because midnight arrivals are 0 and not 1440 now.
flights_updated <- mutate(flights_updated, total_time = abs(arr_mins - dep_mins))
flights_updated
## # A tibble: 336,776 x 22
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 336,766 more rows, and 14 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # dep_mins <dbl>, arr_mins <dbl>, total_time <dbl>
#however, for some flights, total_time is now less than air_time
#e.g. for the first entry, total_time is 193 and air_time is 227.
#This is due to arrival time actually being in the local time zone.
#So, total time must be adjusted for based on time zone.
#This would require running a function that checks each airport's timezone and add to total time 60, 120, or 180 minutes.
#It also would require checking for flights that left before and arrived after midnight.
#I would expect dep_time - sched_dep_time = dep_delay
#print((flights$dep_time - flights$sched_dep_time) == flights$dep_delay)
#Instead, only those flights that left on time are TRUE
#This is also due to the same issue that dep_time and arr_time are in HHMM, not pure minutes
#Both need to be converted to pure minutes since midnight using mutate()
#I've already done this for dep_mins, so I will do it for sched_dep_mins
mutate(flights_updated, sched_dep_mins = (sched_dep_time %/% 100 * 60) + (sched_dep_time %% 100)) -> flights_updated
replace(flights_updated$sched_dep_mins, flights_updated$sched_dep_mins == 1440, 0) -> flights_updated$sched_dep_mins
#print((flights_updated$dep_mins - flights_updated$sched_dep_mins) == flights_updated$dep_delay)
#There still are a few FALSEs
#in all cases, these were for flight whose delay pushed them past midnight into the next day
#filter(flights_updated, dep_mins - sched_dep_mins != dep_delay)
#This filter selects all flights were this was the case
next_day <- filter(flights_updated, dep_mins < sched_dep_mins & dep_delay > 0)
next_day
## # A tibble: 1,236 x 23
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 848 1835 853 1001 1950
## 2 2013 1 2 42 2359 43 518 442
## 3 2013 1 2 126 2250 156 233 2359
## 4 2013 1 3 32 2359 33 504 442
## 5 2013 1 3 50 2145 185 203 2311
## 6 2013 1 3 235 2359 156 700 437
## 7 2013 1 4 25 2359 26 505 442
## 8 2013 1 4 106 2245 141 201 2356
## 9 2013 1 5 14 2359 15 503 445
## 10 2013 1 5 37 2230 127 341 131
## # ... with 1,226 more rows, and 15 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # dep_mins <dbl>, arr_mins <dbl>, total_time <dbl>, sched_dep_mins <dbl>
…
#To clear and run again...
#rm(flights2)
#rm(flights3)
…
#make a copy for testing
flights2 <- flights_updated
…
#Now I can replace the dep_mins with dep_mins + 1440 (an extra 24 hours, i.e the next day)
#flights2$dep_mins[flights2$dep_mins < flights2$sched_dep_mins & flights2$dep_delay > 0] <- flights2$dep_mins + 1440
#flights2
#This gave me an error that NAs are not allowed in subscripted assignemnts.
#There are a few NAs that I need to address
sum(is.na(next_day))
## [1] 22
colSums(is.na(next_day))
## year month day dep_time sched_dep_time
## 0 0 0 0 0
## dep_delay arr_time sched_arr_time arr_delay carrier
## 0 2 0 8 0
## flight tailnum origin dest air_time
## 0 0 0 0 8
## distance hour minute time_hour dep_mins
## 0 0 0 0 0
## arr_mins total_time sched_dep_mins
## 2 2 0
#However, I am not doing calculations/making changes on any data that has NA as its value
#so I don't understand why it is an issue
#I am only working with dep_mins, sched_dep_mins, and dep_delay
#filter(next_day, is.na(next_day$dep_mins) | is.na(next_day$sched_dep_mins) | is.na(next_day$dep_delay))
#filter(next_day, is.na(next_day$arr_delay))
#################################################################################
#I tried creating a new column, temp_dep_time that added 1440 to all dep_times
#Then I would match where flights were delayed into the next day and swapped dep_time for temp_dep_time
#But this also ran into the subcsripted NAs error.
#Essentially the exact same error as before
#mutate(flights2, temp_dep_mins = dep_mins + 1440) -> flights3
#flights3
#flights3$dep_mins[flights3$dep_mins < flights3$sched_dep_mins & flights3$dep_delay > 0] <- flights3$temp_dep_mins
#flights3
#At this point, I don't know how to fix it unless I drop all record with any NAs, or assign some value like -1 to all NAs. Neither option seems like a good one for this data set and the questions being examined.
#?min_rank
flights3 <- flights
flights3 <- mutate(flights3, dep_rank = min_rank(desc(dep_delay)))
arrange(filter(flights3, dep_rank <= 10), by = dep_rank)
## # A tibble: 10 x 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ... with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, dep_rank <int>
#To handle ties, I like min_rank's approach of giving the minimum ranking to all tied elements, then moving on to the nth next ranking.
#That is pretty much how ranking is typically done.
#The two vectors this sequencing creates are not of equal length. Therefore, the smaller vector will wrap around until the end of the larger vector-- 1+1, 2+2, 3+3, 1+4, 2+5, 3+6, 1+7, 2+8, 3+9, 1+10
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
## [1] 2 4 6 5 7 9 8 10 12 11
#?Trig
#According to the documentation, R provides the cos(x), sin(x), tan(x), arc-cos(x), arc-sin(x), arc-tan(x) functions
#as well as cospi(x), sinpi(x), tanpi(x), which would be the trig calculation of x*pi
Write up your solutions to the exercises in 5.6.7 in this document, including the code chunks you use to determine the answer.
1-A. A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.
1-B. A flight is always 10 minutes late.
1-C. A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.
1-D. 99% of the time a flight is on time. 1% of the time it’s 2 hours late.
1-E. Which is more important: arrival delay or departure delay?
#Ways to asses typical delay characteristics
#1- Length of departure/arrival delay
#2- Ratio of being late vs. early in departure/arrival
#3- Consistency of pattern
#4- Avg. number of connecting flights pre/post flight
#5- Occurrences of outlier major delays
#I would argue that arrival delay is far more important
#Arrival delay has more potential impact, especially for connecting flights, etc.
####
not_cancelled <- filter(flights, !(is.na(dep_delay) | is.na(arr_delay)))
#With count
not_cancelled %>%
count(dest)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
not_cancelled %>%
count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows
#With group_by and summarize
not_cancelled %>%
group_by(dest) %>%
summarize(num_not_cancelled = n())
## # A tibble: 104 x 2
## dest num_not_cancelled
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
not_cancelled %>%
group_by(tailnum) %>%
summarize(total_distance_of_flights = sum(distance))
## # A tibble: 4,037 x 2
## tailnum total_distance_of_flights
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows
#The most important between these two is dep_delay.
#A flight might have a reason for no arrival time info (diverted), but won't have any info if it never leaves
#i.e. There are no flights that don't leave but have arr_delay
count(filter(flights, (is.na(dep_delay) | is.na(arr_delay)))) #suboptimal
## # A tibble: 1 x 1
## n
## <int>
## 1 9430
count(filter(flights, (is.na(dep_delay) & !is.na(arr_delay)))) #num flights that don't have dep_delay but have arr_delay
## # A tibble: 1 x 1
## n
## <int>
## 1 0
count(filter(flights, is.na(dep_delay))) #proposed method for finding true num of cancelled flights
## # A tibble: 1 x 1
## n
## <int>
## 1 8255
#Create the new info columns
group_by(flights, year, month, day) %>%
summarize(num_flights = n(),
num_cancelled_flights = sum(is.na(dep_delay)),
prop_cancelled = sum(is.na(dep_delay)) / n(),
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
avg_arr_delay = mean(arr_delay, na.rm = TRUE)) -> flights3
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
flights3
## # A tibble: 365 x 8
## # Groups: year, month [12]
## year month day num_flights num_cancelled_fli~ prop_cancelled avg_dep_delay
## <int> <int> <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 842 4 0.00475 11.5
## 2 2013 1 2 943 8 0.00848 13.9
## 3 2013 1 3 914 10 0.0109 11.0
## 4 2013 1 4 915 6 0.00656 8.95
## 5 2013 1 5 720 3 0.00417 5.73
## 6 2013 1 6 832 1 0.00120 7.15
## 7 2013 1 7 933 3 0.00322 5.42
## 8 2013 1 8 899 4 0.00445 2.55
## 9 2013 1 9 902 5 0.00554 2.28
## 10 2013 1 10 932 3 0.00322 2.84
## # ... with 355 more rows, and 1 more variable: avg_arr_delay <dbl>
Graphing…
#When there are more than ~900 flights on a given day, the number (and %) of cancellations increase significantly.
#When either arrival or departure delays are greater than ~15 minutes, there is a moderate correlation with the percentage #of flights that get cancelled that day
flights3 %>%
ggplot() +
geom_point(mapping = aes(x = num_flights, y = num_cancelled_flights), color = 'mediumpurple1', alpha = 0.5)
flights3 %>%
ggplot() +
geom_point(mapping = aes(x = num_flights, y = prop_cancelled), color = 'mediumpurple3', alpha = 0.5)
flights3 %>%
ggplot(mapping = aes(y = prop_cancelled)) +
geom_point(mapping = aes(x = avg_dep_delay), color = 'red', alpha = 0.5) +
geom_point(mapping = aes(x = avg_arr_delay), color = 'green', alpha = 0.5)
rm(car_delays, car_delays_comp, car_delays_filter, car_delays_filter2, car_delays2, car_delays3)
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays_comp' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays_filter' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays_filter2' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays2' not found
## Warning in rm(car_delays, car_delays_comp, car_delays_filter,
## car_delays_filter2, : object 'car_delays3' not found
#Worst Delays -- raw avg minutes
car_delays_filter <- group_by(flights, carrier)
car_delays_filter %>%
summarize(avg_dep_delay_carr = mean(dep_delay, na.rm = TRUE),
avg_arr_delay_carr = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(avg_arr_delay_carr)) -> car_delays
car_delays
## # A tibble: 16 x 3
## carrier avg_dep_delay_carr avg_arr_delay_carr
## <chr> <dbl> <dbl>
## 1 F9 20.2 21.9
## 2 FL 18.7 20.1
## 3 EV 20.0 15.8
## 4 YV 19.0 15.6
## 5 OO 12.6 11.9
## 6 MQ 10.6 10.8
## 7 WN 17.7 9.65
## 8 B6 13.0 9.46
## 9 9E 16.7 7.38
## 10 UA 12.1 3.56
## 11 US 3.78 2.13
## 12 VX 12.9 1.76
## 13 DL 9.26 1.64
## 14 AA 8.59 0.364
## 15 HA 4.90 -6.92
## 16 AS 5.80 -9.93
…
#Avg delays (all carriers) seen at each airport
group_by(car_delays_filter, dest) %>%
summarize(avg_dep_delay_dest = mean(dep_delay, na.rm = TRUE),
avg_arr_delay_dest = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(avg_arr_delay_dest)) -> car_delays2
car_delays2
## # A tibble: 105 x 3
## dest avg_dep_delay_dest avg_arr_delay_dest
## <chr> <dbl> <dbl>
## 1 CAE 35.6 41.8
## 2 TUL 34.9 33.7
## 3 OKC 30.6 30.6
## 4 JAC 26.5 28.1
## 5 TYS 28.5 24.1
## 6 MSN 23.6 20.2
## 7 RIC 23.6 20.1
## 8 CAK 20.8 19.7
## 9 DSM 26.2 19.0
## 10 GRR 19.5 18.2
## # ... with 95 more rows
…
#Worst delays with airport figured in
car_delays_filter2 <- group_by(flights, carrier, dest)
car_delays_filter2 %>%
summarize(avg_dep_delay_carr = mean(dep_delay, na.rm = TRUE),
avg_arr_delay_carr = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(avg_arr_delay_carr)) -> car_delays3
## `summarise()` has grouped output by 'carrier'. You can override using the `.groups` argument.
car_delays3
## # A tibble: 314 x 4
## # Groups: carrier [16]
## carrier dest avg_dep_delay_carr avg_arr_delay_carr
## <chr> <chr> <dbl> <dbl>
## 1 UA STL 77.5 110
## 2 OO ORD 67 107
## 3 OO DTW 61 68.5
## 4 UA RDU 60 56
## 5 EV CAE 36.7 42.8
## 6 EV TYS 41.8 41.2
## 7 EV PBI 48.7 40.7
## 8 EV TUL 34.9 33.7
## 9 EV OKC 30.6 30.6
## 10 UA JAC 28.7 29.9
## # ... with 304 more rows
#Make a comparison based on +/- avg delay at airport
car_delays_comp <- car_delays3
car_delays_comp %>%
left_join(select(car_delays2, c(dest, avg_arr_delay_dest)), by = 'dest') -> test
#drop_na(test) -> test
test
## # A tibble: 314 x 5
## # Groups: carrier [16]
## carrier dest avg_dep_delay_carr avg_arr_delay_carr avg_arr_delay_dest
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 UA STL 77.5 110 11.1
## 2 OO ORD 67 107 5.88
## 3 OO DTW 61 68.5 5.43
## 4 UA RDU 60 56 10.1
## 5 EV CAE 36.7 42.8 41.8
## 6 EV TYS 41.8 41.2 24.1
## 7 EV PBI 48.7 40.7 8.56
## 8 EV TUL 34.9 33.7 33.7
## 9 EV OKC 30.6 30.6 30.6
## 10 UA JAC 28.7 29.9 28.1
## # ... with 304 more rows
car_delays_comp <- mutate(test, adj_avg_arr_delay = avg_arr_delay_carr - avg_arr_delay_dest)
#flights2 %>%
# select(-origin,-dest) %>%
# left_join(airlines,by = "carrier")
#select(df2, c(id,value2)), by = "id")
#Summarize the mean of the adjusted delays
car_delays_comp %>%
summarize(mean_of_avg_delays_relative_to_all_carriers_avg = mean(adj_avg_arr_delay, na.rm = TRUE)) %>%
arrange(desc(mean_of_avg_delays_relative_to_all_carriers_avg))
## # A tibble: 16 x 2
## carrier mean_of_avg_delays_relative_to_all_carriers_avg
## <chr> <dbl>
## 1 OO 28.0
## 2 F9 13.3
## 3 B6 2.61
## 4 EV 2.16
## 5 MQ 1.49
## 6 VX 0.475
## 7 AA -0.153
## 8 UA -0.695
## 9 WN -1.23
## 10 FL -1.42
## 11 US -3.80
## 12 YV -4.65
## 13 HA -5.55
## 14 9E -6.69
## 15 AS -8.83
## 16 DL -9.87
#original
select(car_delays, -c(avg_dep_delay_carr))
## # A tibble: 16 x 2
## carrier avg_arr_delay_carr
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
#NOTE
#I feel like I actually needed to go back and make the joins with the original full flights table
#so that when the means is taken here, the number of flights per carrier per airport is weighted properly??
#Using the sort argument places the most common occurrences first
#This is useful when doing any analysis based on frequency or volume.
not_cancelled %>%
count(dest, sort= TRUE)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ATL 16837
## 2 ORD 16566
## 3 LAX 16026
## 4 BOS 15022
## 5 MCO 13967
## 6 CLT 13674
## 7 SFO 13173
## 8 FLL 11897
## 9 MIA 11593
## 10 DCA 9111
## # ... with 94 more rows
You will turn in your work, first by posting the work to RPubs, you will probably need to get an account, they are free.
The work will be due by Midnight November 21.
You will get another project starting next week which will be due after Thanksgiving.