library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(fueleconomy)
## Warning: package 'fueleconomy' was built under R version 4.0.4

13.4.6 Exercises

1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here’s an easy way to draw a map of the United States:

#find the average the delay of arrivals at airports
flights_delay <- flights %>%
  group_by(dest) %>%
  #summarize average arrival delays
  summarise(Avg_delay = mean(arr_delay, na.rm = TRUE)) %>%
  #join airports table to get the location of the airports
  inner_join(airports, by = c("dest" = "faa"))

#Graph location of destination airports on a map and flight delays by using a color gradient
flights_delay  %>%
  ggplot(aes(lon, lat, colour = Avg_delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()+
  ylab("latitude")+
  xlab("longitude")+
  ggtitle("Average Delay by Arrival Times")

2. Add the location of the origin and destination (i.e. the lat and lon) to flights.

#subset the faa, lon, and lat variable from the airports table
location <- airports %>%
  select(faa, lon, lat)

flights %>%
  #join the location table to get the origin and destination longitude and latitude locations
  left_join(location, by = c("origin" = "faa")) %>%
  left_join(location, by = c("dest" = "faa"), suffix = c("_origin", "_dest"))
## # A tibble: 336,776 x 23
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 15 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   lon_origin <dbl>, lat_origin <dbl>, lon_dest <dbl>, lat_dest <dbl>

3. Is there a relationship between the age of a plane and its delays?

#create table to with age of the planes
age <- planes %>%
  #create a new variable called age by calculating the difference of the year 2013 by the values in the year column
  mutate(age = 2013 - year)

flights_delay_age <- flights %>%
  #join age table
  left_join(age, by = "tailnum") %>%
  #filter out planes with an age of 0
  filter(age != 0)%>%
  group_by(age) %>%
  #summarize average departure delay by the age of the planes
  summarise(Avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup()

#graph the table of the age of the plane by the average delay
flights_delay_age %>%
ggplot(aes( x =age, y = Avg_delay )) +
  geom_point() +
  ylab("Average Departure Delay")+
  xlab("Age of Plane")+
  ggtitle("Affect of Age on Departure Delays")

There is no correlation between the age of the plane the average delays it experiences.

4. What weather conditions make it more likely to see a delay?

#
flight_weather <- flights %>%
  #join weather table
  left_join(weather, by = "time_hour", suffix = c("_flight", "_weather"))  %>%
group_by(visib) %>%
  #summarize average depature delay by level of visibilty
  summarise(Avg_dep_delay = mean(dep_delay, na.rm = TRUE))

#Graph
flight_weather %>%
  ggplot(aes(x = visib, y = Avg_dep_delay)) +
  geom_line()+
  ylab("Average Departure Delay")+
  xlab("Visibility")+
  ggtitle("Correlation Between Visibility and Average Departure Delays")
## Warning: Removed 1 row(s) containing missing values (geom_path).

Higher delays were experienced during low Visibility.

5. What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.

#create table to visiulize weather affects on delays in 6/13/2013
june <- flights %>%
  #filter for the date
  filter( month == 6 & day == 13 & year == 2013) %>%
  group_by(dest) %>%
  #summarize average delay by destination
  summarise(Avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  #join airports table
inner_join(airports, by = c("dest" = "faa")) 

#Graph map of delays
june %>%
ggplot(aes(y = lat, x = lon, color = Avg_arr_delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()+
  ylab("latitude")+
  xlab("longitude")+
  ggtitle("Average Delay by Arrival Times on 6/13/2013")

The midwestern and southeastern United States were affected by two derecho thunderstorms.

Exercise 13.5.1

1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.)

flights %>%
  #anti_join planes table on tail number to remove planes with tail numbers from the flights table
    anti_join(planes, by = "tailnum") %>%
  #count carrier that do have planes without a tail number 
    count(carrier, sort = TRUE)
## # A tibble: 10 x 2
##    carrier     n
##    <chr>   <int>
##  1 MQ      25397
##  2 AA      22558
##  3 UA       1693
##  4 9E       1044
##  5 B6        830
##  6 US        699
##  7 FL        187
##  8 DL        110
##  9 F9         50
## 10 WN         38
flights %>%
  #filter for flights that do not have a tail number
    filter(is.na(tailnum))
## # A tibble: 2,512 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     2       NA           1545        NA       NA           1910
##  2  2013     1     2       NA           1601        NA       NA           1735
##  3  2013     1     3       NA            857        NA       NA           1209
##  4  2013     1     3       NA            645        NA       NA            952
##  5  2013     1     4       NA            845        NA       NA           1015
##  6  2013     1     4       NA           1830        NA       NA           2044
##  7  2013     1     5       NA            840        NA       NA           1001
##  8  2013     1     7       NA            820        NA       NA            958
##  9  2013     1     8       NA           1645        NA       NA           1838
## 10  2013     1     9       NA            755        NA       NA           1012
## # ... with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

The planes with missing tail numbers have missing departure and arrival times. It can be assumed that the planes without tail numbers were grounded and didn’t leave the origin airport. American Airways (AA) and Envoy Air (MQ) have a higher number of planes without a tail number because they report fleet numbers rather than tail numbers.

2. Filter flights to only show flights with planes that have flown at least 100 flights

#find planes that conducted at least 100 flights
FLT_100<- flights %>%
  group_by(tailnum) %>%
  #count tail number to count the number of flights a plane has conducted
  count(tailnum) %>%
  #filter for flights completed 100 or more flights
  filter(n >= 100)

flights %>%
  #semi join FLT_100 table to remove planes that have had less than 100 flights from the flights table
  semi_join(FLT_100, by = "tailnum")
## # A tibble: 230,902 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      555            600        -5      913            854
##  6  2013     1     1      557            600        -3      709            723
##  7  2013     1     1      557            600        -3      838            846
##  8  2013     1     1      558            600        -2      849            851
##  9  2013     1     1      558            600        -2      853            856
## 10  2013     1     1      558            600        -2      923            937
## # ... with 230,892 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3. Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.

vehicles %>%
  #semi join the common table keep only the common vehicles in the vehicles table
  semi_join(common)
## Joining, by = c("make", "model")
## # A tibble: 14,531 x 12
##       id make  model   year class   trans  drive     cyl displ fuel    hwy   cty
##    <dbl> <chr> <chr>  <dbl> <chr>   <chr>  <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
##  1  1833 Acura Integ~  1986 Subcom~ Autom~ Front-~     4   1.6 Regu~    28    22
##  2  1834 Acura Integ~  1986 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
##  3  3037 Acura Integ~  1987 Subcom~ Autom~ Front-~     4   1.6 Regu~    28    22
##  4  3038 Acura Integ~  1987 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
##  5  4183 Acura Integ~  1988 Subcom~ Autom~ Front-~     4   1.6 Regu~    27    22
##  6  4184 Acura Integ~  1988 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
##  7  5303 Acura Integ~  1989 Subcom~ Autom~ Front-~     4   1.6 Regu~    27    22
##  8  5304 Acura Integ~  1989 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
##  9  6442 Acura Integ~  1990 Subcom~ Autom~ Front-~     4   1.8 Regu~    24    20
## 10  6443 Acura Integ~  1990 Subcom~ Manua~ Front-~     4   1.8 Regu~    26    21
## # ... with 14,521 more rows

4. Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?

two_day <-
flights %>%
  group_by(month, day) %>%
  #Summarize total delay by month and day
  summarize(tot_delay = sum(arr_delay + dep_delay, na.rm = TRUE)) %>%
  #create a variable for a two day delay by using the total delay from the summarize function and the lag function to add the delay for two consecutive days
  mutate(twoday = tot_delay + lag(tot_delay)) %>%
  arrange(desc(twoday))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
weather2 <-
  weather %>%
  group_by(month, day) %>%
  #summarize the mean of the weather conditions by month and day
  summarize(across(c(humid, precip, temp, visib), .fns = mean, na.rm = TRUE))
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
two_day_weather <-
  two_day %>%
  #join the weather2 table with the two_day table
  left_join(weather2) %>%
  #arrange by the two day delay time in descending order
  arrange(desc(twoday))
## Joining, by = c("month", "day")
#create graph to show the affect of weather conditions on delay times of 2 consecutive days
two_day_weather %>%
  ggplot(aes(x= twoday, y = humid, color = visib, size = precip))+
  geom_point() +
  ylab("Relative Humidity") +
  xlab("Total 48 Hour Delay Between Consecutive Days") +
  ggtitle("Weather Affects on Flight Delays Within 48 Hours")
## Warning: Removed 13 rows containing missing values (geom_point).

The worst forty-eight-hour delay was recorded on July 23, 2013. Forty-eight-hour delays seem to have a positive correlation with relative humidity.

5. What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?

flights %>%
  #anti join airports table
  anti_join(airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      615            615         0     1039           1100
##  3  2013     1     1      628            630        -2     1137           1140
##  4  2013     1     1      701            700         1     1123           1154
##  5  2013     1     1      711            715        -4     1151           1206
##  6  2013     1     1      820            820         0     1254           1310
##  7  2013     1     1      820            820         0     1249           1329
##  8  2013     1     1      840            845        -5     1311           1350
##  9  2013     1     1      909            810        59     1331           1315
## 10  2013     1     1      913            918        -5     1346           1416
## # ... with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

It will provide flight destinations codes that do not have a name in the airports table.

airports %>%
  #anti join flights table
  anti_join(flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
##    faa   name                       lat    lon   alt    tz dst   tzone          
##    <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
##  1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo~
##  2 06A   Moton Field Municipal A~  32.5  -85.7   264    -6 A     America/Chicago
##  3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
##  4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo~
##  5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo~
##  6 0A9   Elizabethton Municipal ~  36.4  -82.2  1593    -5 A     America/New_Yo~
##  7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo~
##  8 0G7   Finger Lakes Regional A~  42.9  -76.8   492    -5 A     America/New_Yo~
##  9 0P2   Shoestring Aviation Air~  39.8  -76.6  1000    -5 U     America/New_Yo~
## 10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An~
## # ... with 1,347 more rows

It will provide names of airports that the flights table didn’t have as a destination.

6. You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above.

planes_carriers <-
  flights %>%
  #filter for planes that do not have a missing tail number
  filter(!is.na(tailnum)) %>%
  #get distinct pairs of tail number and carrier combinations
  distinct(tailnum, carrier)

tail_count <- planes_carriers %>%
  #count the tail numbers
  count(tailnum, sort = TRUE) %>%
  #filter the count for values greater than 1
  filter(n>1)

planes_carriers %>%
  #semi join tail_count to keep only planes that had a value greater than one signifying that have had more than one owner
  semi_join(tail_count, by = "tailnum") %>%
  #left join planes to get the details of the planes 
  left_join(planes, by = "tailnum") %>%
  #arrange by tailnum to see the different carriers that owned the same plane 
  arrange(tailnum)
## # A tibble: 34 x 10
##    carrier tailnum  year type     manufacturer  model engines seats speed engine
##    <chr>   <chr>   <int> <chr>    <chr>         <chr>   <int> <int> <int> <chr> 
##  1 9E      N146PQ   2007 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  2 EV      N146PQ   2007 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  3 9E      N153PQ   2007 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  4 EV      N153PQ   2007 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  5 9E      N176PQ   2008 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  6 EV      N176PQ   2008 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  7 9E      N181PQ   2008 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  8 EV      N181PQ   2008 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
##  9 9E      N197PQ   2008 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
## 10 EV      N197PQ   2008 Fixed w~ BOMBARDIER I~ CL-6~       2    95    NA Turbo~
## # ... with 24 more rows

Seventeen planes have been used by more than one carrier.