library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map

Exercise 13.4.6

1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays.

#average_delays join on airports 
average_delays <-
  flights %>%
  filter(arr_delay > 0) %>%       #filter out zero or negative values in arr_delay because they are not delay
  group_by(dest) %>%              
  summarise(avg_delay = mean(arr_delay), na.rm = TRUE) %>%      #summarize the average arrival delay of each airport
  left_join(airports, by = c("dest" = "faa"))     
#mapping average_delays
average_delays %>%
    ggplot(aes(x = lon, y = lat, colour = avg_delay)) +
    borders("state") +
    geom_point() +
    labs(x = "Lontitude", y = "Latitude", title = "Average Delay by Airport")
## Warning: Removed 4 rows containing missing values (geom_point).

    coord_quickmap()
## <ggproto object: Class CoordQuickmap, CoordCartesian, Coord, gg>
##     aspect: function
##     backtransform_range: function
##     clip: on
##     default: FALSE
##     distance: function
##     expand: TRUE
##     is_free: function
##     is_linear: function
##     labels: function
##     limits: list
##     modify_scales: function
##     range: function
##     render_axis_h: function
##     render_axis_v: function
##     render_bg: function
##     render_fg: function
##     setup_data: function
##     setup_layout: function
##     setup_panel_guides: function
##     setup_panel_params: function
##     setup_params: function
##     train_panel_guides: function
##     transform: function
##     super:  <ggproto object: Class CoordQuickmap, CoordCartesian, Coord, gg>

2. Add the location of the origin and destination (i.e. the lat and lon) to flights.

#select latitude and longtitude from airport
airports_location <-
  airports %>%
  select(faa, lon, lat)
#add latitude and longtitude of origin and destination to flights by left joining airports_location
flights1 <- flights %>%
  left_join(airports_location, by = c("dest" = "faa")) %>%
  left_join(airports_location, by = c("origin" = "faa"), suffix = c(".dest", ".origin"))
head(flights1)
## # A tibble: 6 x 23
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ... with 15 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, lon.dest <dbl>, lat.dest <dbl>,
## #   lon.origin <dbl>, lat.origin <dbl>

3. Is there a relationship between the age of a plane and its delays?

#rename variable year in planes to plane_year so it's distinct from year in flights
planes1 <- planes %>%
  select(tailnum:year) %>%
  rename(plane_year = "year")
head(planes1)
## # A tibble: 6 x 2
##   tailnum plane_year
##   <chr>        <int>
## 1 N10156        2004
## 2 N102UW        1998
## 3 N103US        1999
## 4 N104UW        1999
## 5 N10575        2002
## 6 N105UW        1999
flights %>%
  left_join(planes1, by = "tailnum") %>%       #left join flights and planes1
  mutate(plane_age = year - plane_year) %>%    #create a new variable of plane_age 
  filter(arr_delay > 0) %>%                    #filter out non-delay entries
  group_by(plane_age) %>%
  summarise(arr_delay_mean = mean(arr_delay)) %>%      #calculate the average arrival delay per plane_age
  ggplot(mapping = aes(x = plane_age, y = arr_delay_mean)) +
  geom_point() +
  labs(x = "Airplane Age", y = "Average Arrival Delay in Minutes", title = "Airplane Age vs. Average Arrival Delay")
## Warning: Removed 1 rows containing missing values (geom_point).

flights %>%
  left_join(planes1, by = "tailnum") %>%       #left join flights and planes1
  mutate(plane_age = year - plane_year) %>%    #create a new variable of plane_age 
  filter(dep_delay > 0) %>%                    #filter out non-delay entries
  group_by(plane_age) %>%
  summarise(dep_delay_mean = mean(dep_delay)) %>%      #calculate the average departure delay per plane_age
  ggplot(mapping = aes(x = plane_age, y = dep_delay_mean)) +
  geom_point()+
  labs(x = "Airplane Age", y = "Average Departure Delay in Minutes", title = "Airplane Age vs. Average Departure Delay")
## Warning: Removed 1 rows containing missing values (geom_point).

No, there’s no strong relationship between the age of plane and its delays.

4. What weather conditions make it more likely to see a delay?

head(weather)
## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA
## 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA
## 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA
## 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7         NA
## 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7         NA
## 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5         NA
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>
#create a new data frame by left joinining flights and weather
flights2 <- flights %>%
  left_join(weather, by = c("year", "month", "day", "origin" ))
head(flights2)
## # A tibble: 6 x 30
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      517            515         2      830            819
## 3  2013     1     1      517            515         2      830            819
## 4  2013     1     1      517            515         2      830            819
## 5  2013     1     1      517            515         2      830            819
## 6  2013     1     1      517            515         2      830            819
## # ... with 22 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour.x <dbl>, minute <dbl>, time_hour.x <dttm>, hour.y <int>, temp <dbl>,
## #   dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>
flights2 %>%
  group_by(precip) %>%        
  filter(dep_delay > 0) %>%
  summarise(dep_delay_mean = mean(dep_delay)) %>%    #calculate the average departure delay grouped by precipitation in inches
  ggplot(mapping = aes(x = precip, y = dep_delay_mean)) +
  geom_point() +
  labs(x = "Precipitation in Inches", y = "Average Departure Delay Time in Minutes", title = "Precipitation vs. Delay")
## Warning: Removed 1 rows containing missing values (geom_point).

There’s no strong relationship between precipitation and delay.

flights2 %>%
  group_by(pressure) %>%        
  filter(dep_delay > 0) %>%
  summarise(dep_delay_mean = mean(dep_delay)) %>%    #calculate the average departure delay grouped by sea level pressure
  ggplot(mapping = aes(x = pressure, y = dep_delay_mean)) +
  geom_point()+
  geom_smooth() +
  labs(x = "Sea Level Pressure in Millibars", y = "Average Departure Delay in Minutes", title = "Pressure vs. Delay")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

There’s a negative relationship between sea level pressure and delay time. Days with lower pressure weather condition tends to have more delay time.

flights2 %>%
  group_by(visib) %>%        
  filter(dep_delay > 0) %>%
  summarise(dep_delay_mean = mean(dep_delay)) %>%    #calculate the average departure delay grouped by visibility
  ggplot(mapping = aes(x = visib, y = dep_delay_mean)) +
  geom_point()+
  geom_smooth() +
  labs(x = "Visibility in Miles", y = "Average Departure Delay in Minutes", title = "Visibility vs. Delay")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

There’s a negative relationship between visibility and delay time as well. Low visibility contributes to delay time.

5. What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.

flights %>%
  filter(year == 2013, month == 6, day == 13, arr_delay > 0) %>%
  group_by(dest) %>%
  summarise(arr_delay_mean = mean(arr_delay)) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(x = lon, y = lat, color = arr_delay_mean)) +
  borders("state")+ 
  geom_point()+
  labs(x = "Longtitude", y = "Latitude", title = "Average Delay Time by Airports on June 13, 2013") +
  coord_quickmap() 
## Warning: Removed 3 rows containing missing values (geom_point).

On June 13, 2013, strong stomrs caused 900 flights canceled and 7,400 flights delayed.

Exercise 13.5.1

1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.)

#filter out all entries that have tailnum
flights %>%
  filter(is.na(tailnum))
## # A tibble: 2,512 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     2       NA           1545        NA       NA           1910
##  2  2013     1     2       NA           1601        NA       NA           1735
##  3  2013     1     3       NA            857        NA       NA           1209
##  4  2013     1     3       NA            645        NA       NA            952
##  5  2013     1     4       NA            845        NA       NA           1015
##  6  2013     1     4       NA           1830        NA       NA           2044
##  7  2013     1     5       NA            840        NA       NA           1001
##  8  2013     1     7       NA            820        NA       NA            958
##  9  2013     1     8       NA           1645        NA       NA           1838
## 10  2013     1     9       NA            755        NA       NA           1012
## # ... with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
  anti_join(planes, by = "tailnum") %>%
  count(tailnum, sort = TRUE)
## # A tibble: 722 x 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # ... with 712 more rows

If a flight is missing tailnum, it’s also missing dep_time, arr_time, dep_delay, arr_delay, and airtime. It means the flight was canceled.

2. Filter flights to only show flights with planes that have flown at least 100 flights.

#filter out na values in tailnum and count tailnum and select the ones that count >= 100
planes_100f <- flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  count(sort = TRUE) %>%
  filter(n>=100)
#flights semi join planes_100f
flights %>%
  semi_join(planes_100f, by = "tailnum")
## # A tibble: 228,390 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      555            600        -5      913            854
##  6  2013     1     1      557            600        -3      709            723
##  7  2013     1     1      557            600        -3      838            846
##  8  2013     1     1      558            600        -2      849            851
##  9  2013     1     1      558            600        -2      853            856
## 10  2013     1     1      558            600        -2      923            937
## # ... with 228,380 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3. Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.

head(fueleconomy::vehicles)
## # A tibble: 6 x 12
##      id make  model    year class   trans  drive     cyl displ fuel    hwy   cty
##   <dbl> <chr> <chr>   <dbl> <chr>   <chr>  <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
## 1 13309 Acura 2.2CL/~  1997 Subcom~ Autom~ Front-~     4   2.2 Regu~    26    20
## 2 13310 Acura 2.2CL/~  1997 Subcom~ Manua~ Front-~     4   2.2 Regu~    28    22
## 3 13311 Acura 2.2CL/~  1997 Subcom~ Autom~ Front-~     6   3   Regu~    26    18
## 4 14038 Acura 2.3CL/~  1998 Subcom~ Autom~ Front-~     4   2.3 Regu~    27    19
## 5 14039 Acura 2.3CL/~  1998 Subcom~ Manua~ Front-~     4   2.3 Regu~    29    21
## 6 14040 Acura 2.3CL/~  1998 Subcom~ Autom~ Front-~     6   3   Regu~    26    17
head(fueleconomy::common)
## # A tibble: 6 x 4
##   make  model       n years
##   <chr> <chr>   <int> <int>
## 1 Acura Integra    42    16
## 2 Acura Legend     28    10
## 3 Acura MDX 4WD    12    12
## 4 Acura NSX        28    14
## 5 Acura TSX        27    11
## 6 Audi  A4         49    19
fueleconomy::vehicles %>%
  semi_join(fueleconomy::common, by = c("make", "model")) %>% 
  distinct(make, model)   
## # A tibble: 347 x 2
##    make  model           
##    <chr> <chr>           
##  1 Acura Integra         
##  2 Acura Legend          
##  3 Acura MDX 4WD         
##  4 Acura NSX             
##  5 Acura TSX             
##  6 Audi  A4              
##  7 Audi  A4 Avant quattro
##  8 Audi  A4 quattro      
##  9 Audi  A6              
## 10 Audi  A6 Avant quattro
## # ... with 337 more rows

4. Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?

#summarize average departure delay grouped by origin, year, month, day, and hour
flights3 <- flights %>%
  filter(dep_delay > 0) %>%
  group_by(origin, year, month, day, hour) %>%
  summarise(delay = mean(dep_delay)) %>%
  arrange(desc(delay))
## `summarise()` has grouped output by 'origin', 'year', 'month', 'day'. You can override using the `.groups` argument.
#select the top 48 rows from flights3 which represent top 48 highest delay hour
worst_48hr <- head(flights3, 48)
library(DataExplorer)
#worst_48hr left join with weather
worst_48hr_weather <- worst_48hr %>%
  left_join(weather, by = c("origin", "year", "month", "day", "hour") ) 
plot_correlation(worst_48hr_weather)
## 1 features with more than 20 categories ignored!
## time_hour: 45 categories
## Warning in cor(x = structure(list(year = c(2013L, 2013L, 2013L, 2013L, 2013L, :
## the standard deviation is zero
## Warning: Removed 116 rows containing missing values (geom_text).

I can’t find any strong correlation between delay time and all variables from weather data frame.

5. What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?

anti_join(flights, airports, by = c("dest" = "faa")) %>%
  distinct(dest)
## # A tibble: 4 x 1
##   dest 
##   <chr>
## 1 BQN  
## 2 SJU  
## 3 STT  
## 4 PSE

anti_join(flights, airports, by = c(“dest” = “faa”)) tells flights whose destination is not in the airports data frame. There are only four airports: BQN, SJU, STT, PSE. These are airports in Puerto Rico and Virgin Islands.

anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
##    faa   name                       lat    lon   alt    tz dst   tzone          
##    <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
##  1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo~
##  2 06A   Moton Field Municipal A~  32.5  -85.7   264    -6 A     America/Chicago
##  3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
##  4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo~
##  5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo~
##  6 0A9   Elizabethton Municipal ~  36.4  -82.2  1593    -5 A     America/New_Yo~
##  7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo~
##  8 0G7   Finger Lakes Regional A~  42.9  -76.8   492    -5 A     America/New_Yo~
##  9 0P2   Shoestring Aviation Air~  39.8  -76.6  1000    -5 U     America/New_Yo~
## 10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An~
## # ... with 1,347 more rows

anti_join(airports, flights, by = c(“faa” = “dest”)) tells us airports that are not appear in flights data frame. In other words, these are airports that NY airports doesn’t have direct flight to.

6. You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above.

flights %>%
  filter(!is.na(tailnum)) %>%   #filter out canceled flights
  distinct(carrier, tailnum) %>%  #find out distinct carrier and tailnum combination
  count(tailnum) %>%              #count tailnum to see if any tailnum appear more than once
  filter(n>1)
## # A tibble: 17 x 2
##    tailnum     n
##    <chr>   <int>
##  1 N146PQ      2
##  2 N153PQ      2
##  3 N176PQ      2
##  4 N181PQ      2
##  5 N197PQ      2
##  6 N200PQ      2
##  7 N228PQ      2
##  8 N232PQ      2
##  9 N933AT      2
## 10 N935AT      2
## 11 N977AT      2
## 12 N978AT      2
## 13 N979AT      2
## 14 N981AT      2
## 15 N989AT      2
## 16 N990AT      2
## 17 N994AT      2

This hypothesis is wrong. There are 17 planes are flown by two different carriers.