library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
#average_delays join on airports
average_delays <-
flights %>%
filter(arr_delay > 0) %>% #filter out zero or negative values in arr_delay because they are not delay
group_by(dest) %>%
summarise(avg_delay = mean(arr_delay), na.rm = TRUE) %>% #summarize the average arrival delay of each airport
left_join(airports, by = c("dest" = "faa"))
#mapping average_delays
average_delays %>%
ggplot(aes(x = lon, y = lat, colour = avg_delay)) +
borders("state") +
geom_point() +
labs(x = "Lontitude", y = "Latitude", title = "Average Delay by Airport")
## Warning: Removed 4 rows containing missing values (geom_point).
coord_quickmap()
## <ggproto object: Class CoordQuickmap, CoordCartesian, Coord, gg>
## aspect: function
## backtransform_range: function
## clip: on
## default: FALSE
## distance: function
## expand: TRUE
## is_free: function
## is_linear: function
## labels: function
## limits: list
## modify_scales: function
## range: function
## render_axis_h: function
## render_axis_v: function
## render_bg: function
## render_fg: function
## setup_data: function
## setup_layout: function
## setup_panel_guides: function
## setup_panel_params: function
## setup_params: function
## train_panel_guides: function
## transform: function
## super: <ggproto object: Class CoordQuickmap, CoordCartesian, Coord, gg>
#select latitude and longtitude from airport
airports_location <-
airports %>%
select(faa, lon, lat)
#add latitude and longtitude of origin and destination to flights by left joining airports_location
flights1 <- flights %>%
left_join(airports_location, by = c("dest" = "faa")) %>%
left_join(airports_location, by = c("origin" = "faa"), suffix = c(".dest", ".origin"))
head(flights1)
## # A tibble: 6 x 23
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ... with 15 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, lon.dest <dbl>, lat.dest <dbl>,
## # lon.origin <dbl>, lat.origin <dbl>
#rename variable year in planes to plane_year so it's distinct from year in flights
planes1 <- planes %>%
select(tailnum:year) %>%
rename(plane_year = "year")
head(planes1)
## # A tibble: 6 x 2
## tailnum plane_year
## <chr> <int>
## 1 N10156 2004
## 2 N102UW 1998
## 3 N103US 1999
## 4 N104UW 1999
## 5 N10575 2002
## 6 N105UW 1999
flights %>%
left_join(planes1, by = "tailnum") %>% #left join flights and planes1
mutate(plane_age = year - plane_year) %>% #create a new variable of plane_age
filter(arr_delay > 0) %>% #filter out non-delay entries
group_by(plane_age) %>%
summarise(arr_delay_mean = mean(arr_delay)) %>% #calculate the average arrival delay per plane_age
ggplot(mapping = aes(x = plane_age, y = arr_delay_mean)) +
geom_point() +
labs(x = "Airplane Age", y = "Average Arrival Delay in Minutes", title = "Airplane Age vs. Average Arrival Delay")
## Warning: Removed 1 rows containing missing values (geom_point).
flights %>%
left_join(planes1, by = "tailnum") %>% #left join flights and planes1
mutate(plane_age = year - plane_year) %>% #create a new variable of plane_age
filter(dep_delay > 0) %>% #filter out non-delay entries
group_by(plane_age) %>%
summarise(dep_delay_mean = mean(dep_delay)) %>% #calculate the average departure delay per plane_age
ggplot(mapping = aes(x = plane_age, y = dep_delay_mean)) +
geom_point()+
labs(x = "Airplane Age", y = "Average Departure Delay in Minutes", title = "Airplane Age vs. Average Departure Delay")
## Warning: Removed 1 rows containing missing values (geom_point).
No, there’s no strong relationship between the age of plane and its delays.
head(weather)
## # A tibble: 6 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
#create a new data frame by left joinining flights and weather
flights2 <- flights %>%
left_join(weather, by = c("year", "month", "day", "origin" ))
head(flights2)
## # A tibble: 6 x 30
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 517 515 2 830 819
## 3 2013 1 1 517 515 2 830 819
## 4 2013 1 1 517 515 2 830 819
## 5 2013 1 1 517 515 2 830 819
## 6 2013 1 1 517 515 2 830 819
## # ... with 22 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour.x <dbl>, minute <dbl>, time_hour.x <dttm>, hour.y <int>, temp <dbl>,
## # dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>
flights2 %>%
group_by(precip) %>%
filter(dep_delay > 0) %>%
summarise(dep_delay_mean = mean(dep_delay)) %>% #calculate the average departure delay grouped by precipitation in inches
ggplot(mapping = aes(x = precip, y = dep_delay_mean)) +
geom_point() +
labs(x = "Precipitation in Inches", y = "Average Departure Delay Time in Minutes", title = "Precipitation vs. Delay")
## Warning: Removed 1 rows containing missing values (geom_point).
There’s no strong relationship between precipitation and delay.
flights2 %>%
group_by(pressure) %>%
filter(dep_delay > 0) %>%
summarise(dep_delay_mean = mean(dep_delay)) %>% #calculate the average departure delay grouped by sea level pressure
ggplot(mapping = aes(x = pressure, y = dep_delay_mean)) +
geom_point()+
geom_smooth() +
labs(x = "Sea Level Pressure in Millibars", y = "Average Departure Delay in Minutes", title = "Pressure vs. Delay")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
There’s a negative relationship between sea level pressure and delay time. Days with lower pressure weather condition tends to have more delay time.
flights2 %>%
group_by(visib) %>%
filter(dep_delay > 0) %>%
summarise(dep_delay_mean = mean(dep_delay)) %>% #calculate the average departure delay grouped by visibility
ggplot(mapping = aes(x = visib, y = dep_delay_mean)) +
geom_point()+
geom_smooth() +
labs(x = "Visibility in Miles", y = "Average Departure Delay in Minutes", title = "Visibility vs. Delay")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
There’s a negative relationship between visibility and delay time as well. Low visibility contributes to delay time.
flights %>%
filter(year == 2013, month == 6, day == 13, arr_delay > 0) %>%
group_by(dest) %>%
summarise(arr_delay_mean = mean(arr_delay)) %>%
left_join(airports, by = c("dest" = "faa")) %>%
ggplot(aes(x = lon, y = lat, color = arr_delay_mean)) +
borders("state")+
geom_point()+
labs(x = "Longtitude", y = "Latitude", title = "Average Delay Time by Airports on June 13, 2013") +
coord_quickmap()
## Warning: Removed 3 rows containing missing values (geom_point).
On June 13, 2013, strong stomrs caused 900 flights canceled and 7,400 flights delayed.
#filter out all entries that have tailnum
flights %>%
filter(is.na(tailnum))
## # A tibble: 2,512 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 NA 1545 NA NA 1910
## 2 2013 1 2 NA 1601 NA NA 1735
## 3 2013 1 3 NA 857 NA NA 1209
## 4 2013 1 3 NA 645 NA NA 952
## 5 2013 1 4 NA 845 NA NA 1015
## 6 2013 1 4 NA 1830 NA NA 2044
## 7 2013 1 5 NA 840 NA NA 1001
## 8 2013 1 7 NA 820 NA NA 958
## 9 2013 1 8 NA 1645 NA NA 1838
## 10 2013 1 9 NA 755 NA NA 1012
## # ... with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
## # A tibble: 722 x 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # ... with 712 more rows
If a flight is missing tailnum, it’s also missing dep_time, arr_time, dep_delay, arr_delay, and airtime. It means the flight was canceled.
#filter out na values in tailnum and count tailnum and select the ones that count >= 100
planes_100f <- flights %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
count(sort = TRUE) %>%
filter(n>=100)
#flights semi join planes_100f
flights %>%
semi_join(planes_100f, by = "tailnum")
## # A tibble: 228,390 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # ... with 228,380 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
head(fueleconomy::vehicles)
## # A tibble: 6 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 13309 Acura 2.2CL/~ 1997 Subcom~ Autom~ Front-~ 4 2.2 Regu~ 26 20
## 2 13310 Acura 2.2CL/~ 1997 Subcom~ Manua~ Front-~ 4 2.2 Regu~ 28 22
## 3 13311 Acura 2.2CL/~ 1997 Subcom~ Autom~ Front-~ 6 3 Regu~ 26 18
## 4 14038 Acura 2.3CL/~ 1998 Subcom~ Autom~ Front-~ 4 2.3 Regu~ 27 19
## 5 14039 Acura 2.3CL/~ 1998 Subcom~ Manua~ Front-~ 4 2.3 Regu~ 29 21
## 6 14040 Acura 2.3CL/~ 1998 Subcom~ Autom~ Front-~ 6 3 Regu~ 26 17
head(fueleconomy::common)
## # A tibble: 6 x 4
## make model n years
## <chr> <chr> <int> <int>
## 1 Acura Integra 42 16
## 2 Acura Legend 28 10
## 3 Acura MDX 4WD 12 12
## 4 Acura NSX 28 14
## 5 Acura TSX 27 11
## 6 Audi A4 49 19
fueleconomy::vehicles %>%
semi_join(fueleconomy::common, by = c("make", "model")) %>%
distinct(make, model)
## # A tibble: 347 x 2
## make model
## <chr> <chr>
## 1 Acura Integra
## 2 Acura Legend
## 3 Acura MDX 4WD
## 4 Acura NSX
## 5 Acura TSX
## 6 Audi A4
## 7 Audi A4 Avant quattro
## 8 Audi A4 quattro
## 9 Audi A6
## 10 Audi A6 Avant quattro
## # ... with 337 more rows
#summarize average departure delay grouped by origin, year, month, day, and hour
flights3 <- flights %>%
filter(dep_delay > 0) %>%
group_by(origin, year, month, day, hour) %>%
summarise(delay = mean(dep_delay)) %>%
arrange(desc(delay))
## `summarise()` has grouped output by 'origin', 'year', 'month', 'day'. You can override using the `.groups` argument.
#select the top 48 rows from flights3 which represent top 48 highest delay hour
worst_48hr <- head(flights3, 48)
library(DataExplorer)
#worst_48hr left join with weather
worst_48hr_weather <- worst_48hr %>%
left_join(weather, by = c("origin", "year", "month", "day", "hour") )
plot_correlation(worst_48hr_weather)
## 1 features with more than 20 categories ignored!
## time_hour: 45 categories
## Warning in cor(x = structure(list(year = c(2013L, 2013L, 2013L, 2013L, 2013L, :
## the standard deviation is zero
## Warning: Removed 116 rows containing missing values (geom_text).
I can’t find any strong correlation between delay time and all variables from weather data frame.
anti_join(flights, airports, by = c("dest" = "faa")) %>%
distinct(dest)
## # A tibble: 4 x 1
## dest
## <chr>
## 1 BQN
## 2 SJU
## 3 STT
## 4 PSE
anti_join(flights, airports, by = c(“dest” = “faa”)) tells flights whose destination is not in the airports data frame. There are only four airports: BQN, SJU, STT, PSE. These are airports in Puerto Rico and Virgin Islands.
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo~
## 2 06A Moton Field Municipal A~ 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo~
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo~
## 6 0A9 Elizabethton Municipal ~ 36.4 -82.2 1593 -5 A America/New_Yo~
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo~
## 8 0G7 Finger Lakes Regional A~ 42.9 -76.8 492 -5 A America/New_Yo~
## 9 0P2 Shoestring Aviation Air~ 39.8 -76.6 1000 -5 U America/New_Yo~
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An~
## # ... with 1,347 more rows
anti_join(airports, flights, by = c(“faa” = “dest”)) tells us airports that are not appear in flights data frame. In other words, these are airports that NY airports doesn’t have direct flight to.
flights %>%
filter(!is.na(tailnum)) %>% #filter out canceled flights
distinct(carrier, tailnum) %>% #find out distinct carrier and tailnum combination
count(tailnum) %>% #count tailnum to see if any tailnum appear more than once
filter(n>1)
## # A tibble: 17 x 2
## tailnum n
## <chr> <int>
## 1 N146PQ 2
## 2 N153PQ 2
## 3 N176PQ 2
## 4 N181PQ 2
## 5 N197PQ 2
## 6 N200PQ 2
## 7 N228PQ 2
## 8 N232PQ 2
## 9 N933AT 2
## 10 N935AT 2
## 11 N977AT 2
## 12 N978AT 2
## 13 N979AT 2
## 14 N981AT 2
## 15 N989AT 2
## 16 N990AT 2
## 17 N994AT 2
This hypothesis is wrong. There are 17 planes are flown by two different carriers.