Instructions

Exercises: 1-3 (Pgs. 49-50); 2-4 (Pg. 51); 2,4 (Pg. 54); 1-4 (Pg. 58)

Assigned: Friday, August 31, 2018

Due: Friday, September 7, 2018 by 5:00 PM

Submission: Submit via an electronic document on Sakai. Must be submitted as an HTML file generated in RStudio. All assigned problems are taken from the textbook R for Data Science.

Chapter 3 (Pgs. 49-50)

Exercise 1

# (a) Flights whose arrival delay was two or more hours (at least 120 minutes).
flights %>%
  filter(arr_delay >= 120)
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      811            630     101       1047
##  2  2013     1     1      848           1835     853       1001
##  3  2013     1     1      957            733     144       1056
##  4  2013     1     1     1114            900     134       1447
##  5  2013     1     1     1505           1310     115       1638
##  6  2013     1     1     1525           1340     105       1831
##  7  2013     1     1     1549           1445      64.0     1912
##  8  2013     1     1     1558           1359     119       1718
##  9  2013     1     1     1732           1630      62.0     2028
## 10  2013     1     1     1803           1620     103       2008
## # ... with 10,190 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# (b) Flights that flew to Houston (destination code IAH or HOU).
flights %>%
  filter(dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      623            627   -  4.00      933
##  4  2013     1     1      728            732   -  4.00     1041
##  5  2013     1     1      739            739      0        1104
##  6  2013     1     1      908            908      0        1228
##  7  2013     1     1     1028           1026      2.00     1350
##  8  2013     1     1     1044           1045   -  1.00     1352
##  9  2013     1     1     1114            900    134        1447
## 10  2013     1     1     1205           1200      5.00     1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# (c) Flights operated by United (UA), American (AA), or Delta (DL).
flights %>%
  filter(carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515      2.00      830
##  2  2013     1     1      533            529      4.00      850
##  3  2013     1     1      542            540      2.00      923
##  4  2013     1     1      554            600     -6.00      812
##  5  2013     1     1      554            558     -4.00      740
##  6  2013     1     1      558            600     -2.00      753
##  7  2013     1     1      558            600     -2.00      924
##  8  2013     1     1      558            600     -2.00      923
##  9  2013     1     1      559            600     -1.00      941
## 10  2013     1     1      559            600     -1.00      854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# (d) Flights that departed in July, August, or September (months 7 to 9).
flights %>%
  filter(month %in% 7:9)
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029    212         236
##  2  2013     7     1        2           2359      3.00      344
##  3  2013     7     1       29           2245    104         151
##  4  2013     7     1       43           2130    193         322
##  5  2013     7     1       44           2150    174         300
##  6  2013     7     1       46           2051    235         304
##  7  2013     7     1       48           2001    287         308
##  8  2013     7     1       58           2155    183         335
##  9  2013     7     1      100           2146    194         327
## 10  2013     7     1      100           2245    135         337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# (e) Flights that arrived more than two hours late even though they did not
#     leave late (departure delay of zero or less).
flights %>%
  filter(arr_delay > 120, dep_delay <= 0)
## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1    27     1419           1420     -1.00     1754
##  2  2013    10     7     1350           1350      0        1736
##  3  2013    10     7     1357           1359     -2.00     1858
##  4  2013    10    16      657            700     -3.00     1258
##  5  2013    11     1      658            700     -2.00     1329
##  6  2013     3    18     1844           1847     -3.00       39
##  7  2013     4    17     1635           1640     -5.00     2049
##  8  2013     4    18      558            600     -2.00     1149
##  9  2013     4    18      655            700     -5.00     1213
## 10  2013     5    22     1827           1830     -3.00     2217
## # ... with 19 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# (f) Flights that left at least an hour late but made up more than 30 minutes
#     in the air (departure delay exceeds arrival delay by over 30 minutes).
flights %>%
  filter(dep_delay >= 60 & dep_delay - arr_delay > 30)
## # A tibble: 1,844 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1     2205           1720     285         46
##  2  2013     1     1     2326           2130     116        131
##  3  2013     1     3     1503           1221     162       1803
##  4  2013     1     3     1839           1700      99.0     2056
##  5  2013     1     3     1850           1745      65.0     2148
##  6  2013     1     3     1941           1759     102       2246
##  7  2013     1     3     1950           1845      65.0     2228
##  8  2013     1     3     2015           1915      60.0     2135
##  9  2013     1     3     2257           2000     177         45
## 10  2013     1     4     1917           1700     137       2135
## # ... with 1,834 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# (g) Departed between midnight and 6 a.m. (inclusive).
# dep_time is an HHMM number; this filter treats 2400 as the code for a
# midnight departure, so that value has to be matched explicitly.
# The two conditions must be joined with `|` (OR): comma-separated arguments to
# filter() are AND-ed, and no dep_time can be both <= 600 and == 2400, which is
# why the previous version of this filter returned zero rows.
# NOTE: the 0-row output recorded below was produced by that previous version;
# re-knit the document to refresh it.
filter(flights, dep_time <= 600 | dep_time == 2400)
## # A tibble: 0 x 19
## # ... with 19 variables: year <int>, month <int>, day <int>,
## #   dep_time <int>, sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Exercise 2

# between() takes three arguments: x, left, and right. x is the value being tested, and between() is TRUE when x is greater than or equal to left and less than or equal to right (both bounds are inclusive). Using this function, we can find the flights that took off between July and September: month should be greater than or equal to 7 and less than or equal to 9. 

flights %>%
  filter(between(month, 7, 9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029    212         236
##  2  2013     7     1        2           2359      3.00      344
##  3  2013     7     1       29           2245    104         151
##  4  2013     7     1       43           2130    193         322
##  5  2013     7     1       44           2150    174         300
##  6  2013     7     1       46           2051    235         304
##  7  2013     7     1       48           2001    287         308
##  8  2013     7     1       58           2155    183         335
##  9  2013     7     1      100           2146    194         327
## 10  2013     7     1      100           2245    135         337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Exercise 3

# 8,255 flights have a missing departure time. These flights are also missing
# an arrival time. Since there is no takeoff or landing time, the flights were
# probably canceled.
flights %>%
  filter(is.na(dep_time))
## # A tibble: 8,255 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       NA           1630        NA       NA
##  2  2013     1     1       NA           1935        NA       NA
##  3  2013     1     1       NA           1500        NA       NA
##  4  2013     1     1       NA            600        NA       NA
##  5  2013     1     2       NA           1540        NA       NA
##  6  2013     1     2       NA           1620        NA       NA
##  7  2013     1     2       NA           1355        NA       NA
##  8  2013     1     2       NA           1420        NA       NA
##  9  2013     1     2       NA           1321        NA       NA
## 10  2013     1     2       NA           1545        NA       NA
## # ... with 8,245 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Chapter 3 (Pg. 51)

Exercise 2

# Sorting dep_delay in descending order lists the most delayed flights first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     6    15     1432           1935      1137     1607
##  3  2013     1    10     1121           1635      1126     1239
##  4  2013     9    20     1139           1845      1014     1457
##  5  2013     7    22      845           1600      1005     1044
##  6  2013     4    10     1100           1900       960     1342
##  7  2013     3    17     2321            810       911      135
##  8  2013     6    27      959           1900       899     1236
##  9  2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Sorting dep_delay in ascending order lists the flights that left the
# earliest: first the ones that took off before their scheduled time, then the
# flights that left on time, followed by the flights that were late to take off.
flights %>%
  arrange(dep_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12     7     2040           2123     -43.0       40
##  2  2013     2     3     2022           2055     -33.0     2240
##  3  2013    11    10     1408           1440     -32.0     1549
##  4  2013     1    11     1900           1930     -30.0     2233
##  5  2013     1    29     1703           1730     -27.0     1947
##  6  2013     8     9      729            755     -26.0     1002
##  7  2013    10    23     1907           1932     -25.0     2143
##  8  2013     3    30     2030           2055     -25.0     2213
##  9  2013     3     2     1431           1455     -24.0     1601
## 10  2013     5     5      934            958     -24.0     1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Exercise 3

# Arranging the flights in order from shortest air time to longest and then
# applying head() displays a table of the six fastest flights. To see the
# entire list in order, drop head() and use arrange(flights, air_time) alone.
flights %>%
  arrange(air_time) %>%
  head()
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1    16     1355           1315     40.0      1442
## 2  2013     4    13      537            527     10.0       622
## 3  2013    12     6      922            851     31.0      1021
## 4  2013     2     3     2153           2129     24.0      2247
## 5  2013     2     5     1303           1315    -12.0      1342
## 6  2013     2    12     2123           2130    - 7.00     2211
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Exercise 4

# The flight with the shortest scheduled distance (17 miles), from Newark, NJ
# (EWR) to New York (LGA), was cancelled. The next shortest flight by distance
# was Newark (EWR) to Philadelphia (PHL), at 80 miles.
flights %>%
  arrange(distance) %>%
  head()
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     7    27       NA            106     NA          NA
## 2  2013     1     3     2127           2129   -  2.00     2222
## 3  2013     1     4     1240           1200     40.0      1333
## 4  2013     1     4     1829           1615    134        1937
## 5  2013     1     4     2128           2129   -  1.00     2218
## 6  2013     1     5     1155           1200   -  5.00     1241
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
# The longest flight covered 4,983 miles, from New York (JFK) to Hawaii (HNL).
flights %>%
  arrange(desc(distance)) %>%
  head()
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      857            900    - 3.00     1516
## 2  2013     1     2      909            900      9.00     1525
## 3  2013     1     3      914            900     14.0      1504
## 4  2013     1     4      900            900      0        1516
## 5  2013     1     5      858            900    - 2.00     1519
## 6  2013     1     6     1019            900     79.0      1558
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Chapter 3 (Pg. 54)

Exercise 2

# Naming the same variable more than once in a select() call has no extra
# effect: the repeats are ignored and the column appears only once.
flights %>%
  select(year, month, day, day)
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows

Exercise 4

#The following code displays every column whose name contains "time", whether it is written in lower case or upper case. This surprises me because I have worked with other programs that are case sensitive. By default the select helpers are not case sensitive, meaning that contains() matches both upper-case and lower-case names. To change this default and make the helper case sensitive, you must also add ignore.case = FALSE. The code would then look like:   select(flights, contains("TIME", ignore.case = FALSE))

flights %>%
  select(contains("TIME"))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time
##       <int>          <int>    <int>          <int>    <dbl>
##  1      517            515      830            819    227  
##  2      533            529      850            830    227  
##  3      542            540      923            850    160  
##  4      544            545     1004           1022    183  
##  5      554            600      812            837    116  
##  6      554            558      740            728    150  
##  7      555            600      913            854    158  
##  8      557            600      709            723     53.0
##  9      557            600      838            846    140  
## 10      558            600      753            745    138  
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>

Chapter 3 (Pg. 58)

Exercise 1

# Convert a clock time stored as an HHMM number (e.g. 517 means 5:17 a.m.)
# into the number of minutes since midnight.
#   x: numeric vector of HHMM clock times; NA values stay NA.
# Returns a numeric vector the same length as x (0-1439 for valid clock times).
minutes <- function(x) {
  hours <- x %/% 100
  mins <- x %% 100
  # A time of 2400 (midnight) would otherwise convert to 1440 rather than 0,
  # so wrap the total back onto a single day.
  (hours * 60 + mins) %% 1440
}
# Add minutes-since-midnight versions of the actual and scheduled departure
# times, then show each original column next to its converted counterpart.
flights %>%
  mutate(
    dep_time_mins = minutes(dep_time),
    sched_dep_time_mins = minutes(sched_dep_time)
  ) %>%
  select(dep_time, dep_time_mins, sched_dep_time, sched_dep_time_mins)
## # A tibble: 336,776 x 4
##    dep_time dep_time_mins sched_dep_time sched_dep_time_mins
##       <int>         <dbl>          <int>               <dbl>
##  1      517           317            515                 315
##  2      533           333            529                 329
##  3      542           342            540                 340
##  4      544           344            545                 345
##  5      554           354            600                 360
##  6      554           354            558                 358
##  7      555           355            600                 360
##  8      557           357            600                 360
##  9      557           357            600                 360
## 10      558           358            600                 360
## # ... with 336,766 more rows

Exercise 2

#When comparing arr_time - dep_time with air_time, I expected these two quantities to be equal; however, this was not the case for some flights. The differences could be due to the fact that the plane was traveling through different time zones, which would affect the local time at which it landed. To fix this, if there is a change in time zones, you can convert the arrival time to the same time zone as the one the plane departed from. 

# Convert the HHMM arrival and departure times to minutes, take their
# difference (adding 1440 and reducing modulo 1440 keeps it non-negative when
# the arrival clock time is smaller than the departure clock time), and compare
# that difference with the recorded air_time.
air_times <- flights %>%
  mutate(
    arr_time_min = arr_time %/% 100 * 60 + arr_time %% 100,
    dep_time_min = dep_time %/% 100 * 60 + dep_time %% 100,
    air_time_2 = (arr_time_min - dep_time_min + 1440) %% 1440,
    air_time_diff = air_time_2 - air_time
  )

# List the discrepancies, largest absolute difference first.
air_times %>%
  select(air_time_diff) %>%
  arrange(desc(abs(air_time_diff)))
## # A tibble: 336,776 x 1
##    air_time_diff
##            <dbl>
##  1          -345
##  2          -345
##  3          -345
##  4          -345
##  5          -344
##  6          -344
##  7          -344
##  8          -344
##  9          -344
## 10          -343
## # ... with 336,766 more rows

Exercise 3

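# I would expect the difference between dep_time and sched_dep_time (both converted to minutes since midnight) to equal dep_delay. The rows below are the flights where it does not: they were delayed past midnight, so dep_time falls on the following day and the simple difference comes out 1440 minutes too small (for example, 43 - 1440 = -1397). 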

# Recompute the departure delay from the two clock times and keep only the
# flights where it disagrees with the recorded dep_delay.
flights %>%
  mutate(dep_delay2 = minutes(dep_time) - minutes(sched_dep_time)) %>%
  filter(dep_delay2 != dep_delay) %>%
  select(dep_time, sched_dep_time, dep_delay, dep_delay2)
## # A tibble: 1,207 x 4
##    dep_time sched_dep_time dep_delay dep_delay2
##       <int>          <int>     <dbl>      <dbl>
##  1      848           1835     853        - 587
##  2       42           2359      43.0      -1397
##  3      126           2250     156        -1284
##  4       32           2359      33.0      -1407
##  5       50           2145     185        -1255
##  6      235           2359     156        -1284
##  7       25           2359      26.0      -1414
##  8      106           2245     141        -1299
##  9       14           2359      15.0      -1425
## 10       37           2230     127        -1313
## # ... with 1,197 more rows

Exercise 4

# Rank flights so that the most delayed flight gets rank 1. With min_rank(),
# flights tied on dep_delay all share the lowest rank available to them, so
# tied flights are all kept together.
flights %>%
  mutate(dep_delay_rank = min_rank(desc(dep_delay))) %>%
  filter(dep_delay_rank <= 10) %>%
  arrange(dep_delay_rank)
## # A tibble: 10 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     6    15     1432           1935      1137     1607
##  3  2013     1    10     1121           1635      1126     1239
##  4  2013     9    20     1139           1845      1014     1457
##  5  2013     7    22      845           1600      1005     1044
##  6  2013     4    10     1100           1900       960     1342
##  7  2013     3    17     2321            810       911      135
##  8  2013     6    27      959           1900       899     1236
##  9  2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, dep_delay_rank <int>