Exercises: 1-3 (Pgs. 49-50); 2-4 (Pg. 51); 2,4 (Pg. 54); 1-4 (Pg. 58)
Assigned: Friday, August 31, 2018
Due: Friday, September 7, 2018 by 5:00 PM
Submission: Submit an electronic document via Sakai. It must be an HTML file generated in RStudio. All assigned problems are taken from the textbook R for Data Science.
# (a) Flights that arrived two or more hours (>= 120 minutes) late
flights %>%
  filter(arr_delay >= 120)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 811 630 101 1047
## 2 2013 1 1 848 1835 853 1001
## 3 2013 1 1 957 733 144 1056
## 4 2013 1 1 1114 900 134 1447
## 5 2013 1 1 1505 1310 115 1638
## 6 2013 1 1 1525 1340 105 1831
## 7 2013 1 1 1549 1445 64.0 1912
## 8 2013 1 1 1558 1359 119 1718
## 9 2013 1 1 1732 1630 62.0 2028
## 10 2013 1 1 1803 1620 103 2008
## # ... with 10,190 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# (b) Flights to Houston: keep rows whose destination is either of the two
# Houston airport codes
flights %>%
  filter(dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2.00 830
## 2 2013 1 1 533 529 4.00 850
## 3 2013 1 1 623 627 - 4.00 933
## 4 2013 1 1 728 732 - 4.00 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2.00 1350
## 8 2013 1 1 1044 1045 - 1.00 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5.00 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# (c) Flights operated by United (UA), American (AA), or Delta (DL)
flights %>%
  filter(carrier %in% c("AA", "DL", "UA"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2.00 830
## 2 2013 1 1 533 529 4.00 850
## 3 2013 1 1 542 540 2.00 923
## 4 2013 1 1 554 600 -6.00 812
## 5 2013 1 1 554 558 -4.00 740
## 6 2013 1 1 558 600 -2.00 753
## 7 2013 1 1 558 600 -2.00 924
## 8 2013 1 1 558 600 -2.00 923
## 9 2013 1 1 559 600 -1.00 941
## 10 2013 1 1 559 600 -1.00 854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# (d) Flights that departed in July, August, or September.
# month is an integer column, so membership in 7:9 selects exactly the
# months 7, 8, and 9.
flights %>%
  filter(month %in% 7:9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3.00 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# (e) Flights that arrived more than two hours late even though they did
# not leave late (departure delay of zero or less)
flights %>%
  filter(arr_delay > 120 & dep_delay <= 0)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 27 1419 1420 -1.00 1754
## 2 2013 10 7 1350 1350 0 1736
## 3 2013 10 7 1357 1359 -2.00 1858
## 4 2013 10 16 657 700 -3.00 1258
## 5 2013 11 1 658 700 -2.00 1329
## 6 2013 3 18 1844 1847 -3.00 39
## 7 2013 4 17 1635 1640 -5.00 2049
## 8 2013 4 18 558 600 -2.00 1149
## 9 2013 4 18 655 700 -5.00 1213
## 10 2013 5 22 1827 1830 -3.00 2217
## # ... with 19 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# (f) Flights that left at least an hour late but made up more than 30
# minutes in the air, i.e. the arrival delay is over 30 minutes smaller
# than the departure delay
flights %>%
  filter(dep_delay >= 60 & dep_delay - arr_delay > 30)
## # A tibble: 1,844 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 2205 1720 285 46
## 2 2013 1 1 2326 2130 116 131
## 3 2013 1 3 1503 1221 162 1803
## 4 2013 1 3 1839 1700 99.0 2056
## 5 2013 1 3 1850 1745 65.0 2148
## 6 2013 1 3 1941 1759 102 2246
## 7 2013 1 3 1950 1845 65.0 2228
## 8 2013 1 3 2015 1915 60.0 2135
## 9 2013 1 3 2257 2000 177 45
## 10 2013 1 4 1917 1700 137 2135
## # ... with 1,834 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# (g) Flights that departed between midnight and 6 a.m. (inclusive).
# dep_time is a clock reading in HHMM form. The explicit 2400 test covers
# departures recorded as exactly midnight, which `dep_time <= 600` would miss.
# The two tests must be joined with `|` (OR): comma-separated arguments to
# filter() are AND-ed, and no dep_time can be both <= 600 and == 2400, so
# the comma form always returned zero rows.
filter(flights, dep_time <= 600 | dep_time == 2400)
# NOTE: the output previously recorded here (a 0 x 19 tibble) was produced by
# the AND-ed conditions; re-knit the document to regenerate the listing.
# between() takes three arguments: x, left, and right. It is a shortcut for
# x >= left & x <= right, so both bounds are inclusive. Here it finds the
# flights that took off from July through September: month must be greater
# than or equal to 7 and less than or equal to 9.
flights %>%
  filter(between(month, 7, 9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3.00 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# 8,255 flights have a missing departure time. These flights are also
# missing an arrival time. Since there is neither a take-off nor a landing
# time, the flights were probably cancelled.
flights %>%
  filter(is.na(dep_time))
## # A tibble: 8,255 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 NA 1630 NA NA
## 2 2013 1 1 NA 1935 NA NA
## 3 2013 1 1 NA 1500 NA NA
## 4 2013 1 1 NA 600 NA NA
## 5 2013 1 2 NA 1540 NA NA
## 6 2013 1 2 NA 1620 NA NA
## 7 2013 1 2 NA 1355 NA NA
## 8 2013 1 2 NA 1420 NA NA
## 9 2013 1 2 NA 1321 NA NA
## 10 2013 1 2 NA 1545 NA NA
## # ... with 8,245 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sorting by dep_delay in descending order lists the most delayed flights
# first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sorting by dep_delay in ascending order lists the flights that left
# earliest relative to schedule: first those that took off before their
# scheduled time, then the on-time flights, followed by the flights that
# were late to take off.
flights %>%
  arrange(dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 7 2040 2123 -43.0 40
## 2 2013 2 3 2022 2055 -33.0 2240
## 3 2013 11 10 1408 1440 -32.0 1549
## 4 2013 1 11 1900 1930 -30.0 2233
## 5 2013 1 29 1703 1730 -27.0 1947
## 6 2013 8 9 729 755 -26.0 1002
## 7 2013 10 23 1907 1932 -25.0 2143
## 8 2013 3 30 2030 2055 -25.0 2213
## 9 2013 3 2 1431 1455 -24.0 1601
## 10 2013 5 5 934 958 -24.0 1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sorting the flights from shortest to longest air time and then taking
# head() displays a table of the six fastest flights. To see the entire
# ordered list, drop the head() step and run arrange(flights, air_time).
flights %>%
  arrange(air_time) %>%
  head()
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 16 1355 1315 40.0 1442
## 2 2013 4 13 537 527 10.0 622
## 3 2013 12 6 922 851 31.0 1021
## 4 2013 2 3 2153 2129 24.0 2247
## 5 2013 2 5 1303 1315 -12.0 1342
## 6 2013 2 12 2123 2130 - 7.00 2211
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# The flight with the shortest distance (17 miles), from Newark, NJ (EWR)
# to New York (LGA), was cancelled. The next shortest route by distance is
# Newark (EWR) to Philadelphia (PHL), at 80 miles.
flights %>%
  arrange(distance) %>%
  head()
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 27 NA 106 NA NA
## 2 2013 1 3 2127 2129 - 2.00 2222
## 3 2013 1 4 1240 1200 40.0 1333
## 4 2013 1 4 1829 1615 134 1937
## 5 2013 1 4 2128 2129 - 1.00 2218
## 6 2013 1 5 1155 1200 - 5.00 1241
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# The longest flight by distance was 4,983 miles, from New York (JFK) to
# Hawaii (HNL).
flights %>%
  arrange(desc(distance)) %>%
  head()
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 857 900 - 3.00 1516
## 2 2013 1 2 909 900 9.00 1525
## 3 2013 1 3 914 900 14.0 1504
## 4 2013 1 4 900 900 0 1516
## 5 2013 1 5 858 900 - 2.00 1519
## 6 2013 1 6 1019 900 79.0 1558
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# If a variable name is given more than once in a select() call, the
# repeats are ignored and the column appears only once (day is listed twice
# below but the result has three columns).
flights %>%
  select(year, month, day, day)
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
# The following code returns every column whose name contains "time",
# regardless of case. This surprised me because other programs I have used
# are case sensitive. By default the select helpers ignore case, so
# contains("TIME") also matches lower-case names. To make the match case
# sensitive, add ignore.case = FALSE, e.g.:
#   select(flights, contains("TIME", ignore.case = FALSE))
flights %>%
  select(contains("TIME"))
## # A tibble: 336,776 x 6
## dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <dbl>
## 1 517 515 830 819 227
## 2 533 529 850 830 227
## 3 542 540 923 850 160
## 4 544 545 1004 1022 183
## 5 554 600 812 837 116
## 6 554 558 740 728 150
## 7 555 600 913 854 158
## 8 557 600 709 723 53.0
## 9 557 600 838 846 140
## 10 558 600 753 745 138
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
# Convert clock times written as HHMM numbers (e.g. 517 for 5:17) into the
# number of minutes since midnight (517 -> 5 * 60 + 17 = 317).
#   x: numeric vector of HHMM values; NA values stay NA.
# Returns a numeric vector the same length as x. Note that 2400 maps to
# 1440 (not 0), because the hour part is taken as-is.
minutes <- function(x) {
  hour_part <- x %/% 100
  minute_part <- x %% 100
  hour_part * 60 + minute_part
}
# Add minutes-since-midnight versions of dep_time and sched_dep_time, then
# show each original HHMM column next to its converted counterpart.
flights %>%
  mutate(
    dep_time_mins = minutes(dep_time),
    sched_dep_time_mins = minutes(sched_dep_time)
  ) %>%
  select(dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins)
## # A tibble: 336,776 x 4
## dep_time dep_time_mins sched_dep_time sched_dep_time_mins
## <int> <dbl> <int> <dbl>
## 1 517 317 515 315
## 2 533 333 529 329
## 3 542 342 540 340
## 4 544 344 545 345
## 5 554 354 600 360
## 6 554 354 558 358
## 7 555 355 600 360
## 8 557 357 600 360
## 9 557 357 600 360
## 10 558 358 600 360
## # ... with 336,766 more rows
# I expected arr_time - dep_time to equal air_time, but this was not the
# case for some flights. The differences could be due to the plane
# travelling through different time zones, which affects the local time at
# which it lands. To fix this, when the time zone changes, the arrival time
# could be converted to the time zone the plane departed from.
air_times <- flights %>%
  mutate(
    arr_time_min = minutes(arr_time),
    dep_time_min = minutes(dep_time),
    # Adding one day (1440 minutes) before taking the remainder keeps the
    # difference non-negative when the arrival clock time is past midnight.
    air_time_2 = (arr_time_min - dep_time_min + 1440) %% 1440,
    air_time_diff = air_time_2 - air_time
  )
# Show the discrepancies, largest absolute difference first.
air_times %>%
  select(air_time_diff) %>%
  arrange(desc(abs(air_time_diff)))
## # A tibble: 336,776 x 1
## air_time_diff
## <dbl>
## 1 -345
## 2 -345
## 3 -345
## 4 -345
## 5 -344
## 6 -344
## 7 -344
## 8 -344
## 9 -344
## 10 -343
## # ... with 336,766 more rows
# I would expect the difference between dep_time and sched_dep_time (both
# converted to minutes) to equal dep_delay. The rows listed are the ones
# where it does not; in the rows shown, the computed value is exactly 1440
# minutes (one day) below dep_delay, i.e. the flight left after midnight on
# the day following its scheduled departure.
flights %>%
  mutate(dep_delay2 = minutes(dep_time) - minutes(sched_dep_time)) %>%
  filter(dep_delay2 != dep_delay) %>%
  select(dep_time, sched_dep_time, dep_delay, dep_delay2)
## # A tibble: 1,207 x 4
## dep_time sched_dep_time dep_delay dep_delay2
## <int> <int> <dbl> <dbl>
## 1 848 1835 853 - 587
## 2 42 2359 43.0 -1397
## 3 126 2250 156 -1284
## 4 32 2359 33.0 -1407
## 5 50 2145 185 -1255
## 6 235 2359 156 -1284
## 7 25 2359 26.0 -1414
## 8 106 2245 141 -1299
## 9 14 2359 15.0 -1425
## 10 37 2230 127 -1313
## # ... with 1,197 more rows
# Rank flights from most to least delayed and keep the top 10.
# min_rank() gives tied flights the same rank (the smallest rank of the
# tie), so flights tied for most delayed would all be ranked 1, and a tie
# at the cut-off would return more than 10 rows.
flights %>%
  mutate(dep_delay_rank = min_rank(desc(dep_delay))) %>%
  filter(dep_delay_rank <= 10) %>%
  arrange(dep_delay_rank)
## # A tibble: 10 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, dep_delay_rank <int>