library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# Map of every airport that occurs as a destination in `flights`,
# drawn on top of US state outlines.
airports %>%
  semi_join(flights, by = c("faa" = "dest")) %>%
  ggplot(mapping = aes(x = lon, y = lat)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
# Mean arrival delay per destination (NA delays ignored), with the matching
# row of `airports` attached. The inner join keeps only destinations whose
# code is present in `airports$faa`.
avg_dest_delays <- summarise(
  group_by(flights, dest),
  delay = mean(arr_delay, na.rm = TRUE)
) %>%
  inner_join(airports, by = c("dest" = "faa"))
# Plot each destination at its coordinates, coloured by its mean arrival
# delay, on top of US state outlines.
ggplot(avg_dest_delays, aes(x = lon, y = lat, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap() +
  labs(
    x = "Longitude",
    y = "Latitude",
    title = "Average Delay by Arrival Times"
  )
# Lookup table: airport code plus its latitude and longitude.
airport_locations <- select(airports, faa, lat, lon)
# Attach origin and destination coordinates to every flight.
# The first join adds plain `lat`/`lon` for the origin; the second join then
# has clashing `lat`/`lon` columns, which `suffix` disambiguates into
# `*.origin` (already present) and `*.dest` (newly joined).
flights %>%
  select(origin, dest) %>%
  left_join(
    airport_locations,
    by = c("origin" = "faa")
  ) %>%
  left_join(
    airport_locations,
    by = c("dest" = "faa"),
    suffix = c(".origin", ".dest")
  )
## # A tibble: 336,776 x 6
## origin dest lat.origin lon.origin lat.dest lon.dest
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 EWR IAH 40.7 -74.2 30.0 -95.3
## 2 LGA IAH 40.8 -73.9 30.0 -95.3
## 3 JFK MIA 40.6 -73.8 25.8 -80.3
## 4 JFK BQN 40.6 -73.8 NA NA
## 5 LGA ATL 40.8 -73.9 33.6 -84.4
## 6 EWR ORD 40.7 -74.2 42.0 -87.9
## 7 EWR FLL 40.7 -74.2 26.1 -80.2
## 8 LGA IAD 40.8 -73.9 38.9 -77.5
## 9 JFK MCO 40.6 -73.8 28.4 -81.3
## 10 LGA ORD 40.8 -73.9 42.0 -87.9
## # … with 336,766 more rows
# Age of each plane in 2013 (2013 minus the `year` column of `planes`),
# keyed by tail number.
plane_ages <- planes %>%
  transmute(tailnum, age = 2013 - year)
# Mean departure delay by plane age, drawn as points connected by a line.
# Flights with a missing `dep_delay` are dropped before averaging.
# NOTE(review): the plot warns about one removed row, presumably the group
# of planes whose age is NA -- confirm.
flights %>%
  filter(!is.na(dep_delay)) %>%
  inner_join(plane_ages, by = "tailnum") %>%
  group_by(age) %>%
  summarise(delay = mean(dep_delay)) %>%
  ggplot(mapping = aes(x = age, y = delay)) +
  geom_point() +
  geom_line()
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
There does not appear to be a simple linear relationship between the age of a plane and its departure delays. From 0 to 10 years there is a strong positive correlation; then the trend reverses, giving a negative correlation between plane age and departure delay from 10 to 30 years.
# Each flight paired with the weather row for the same origin, year, month,
# day and hour; flights with no matching weather row are dropped.
flight_weather <- inner_join(
  flights,
  weather,
  by = c("origin", "year", "month", "day", "hour")
)
# Mean departure delay for each distinct precipitation value.
flight_weather %>%
  group_by(precip) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = precip, y = delay)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Precipitation",
    y = "Delays",
    title = "Precipitation vs Delays"
  )
# Mean departure delay for each distinct wind-gust value.
flight_weather %>%
  group_by(wind_gust) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = wind_gust, y = delay)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Wind Gust",
    y = "Delays",
    title = "Wind Gust vs Delays"
  )
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
# Mean departure delay for each distinct visibility value.
flight_weather %>%
  group_by(visib) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = visib, y = delay)) +
  geom_line() +
  geom_point() +
  labs(
    x = "Visibility",
    y = "Delays",
    title = "Visibility vs Delays"
  )
Precipitation seems to have a positive correlation with departure delays: higher levels of precipitation correspond to longer average delays.
Wind gust speed also seems to have a positive correlation with departure delays: higher wind gusts correspond to longer average delays.
Visibility seems to have a negative correlation with departure delays: flights departing in low visibility are more likely to be delayed.
# Mean arrival delay per destination for flights on 13 June 2013, plotted at
# each destination airport's coordinates on top of US state outlines.
# Both point size and colour encode the mean delay.
flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarize(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(x = lon, y = lat, size = delay, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap() +
  # x is mapped to `lon` and y to `lat`, so the axis labels must follow suit
  # (they were previously swapped).
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle(" Average Delay by Arrival Times on June 13, 2013")
## Warning: Removed 3 rows containing missing values (geom_point).
The eastern United States and the Midwest were affected by the storms.
# Flights that have no recorded tail number.
filter(flights, is.na(tailnum))
## # A tibble: 2,512 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 NA 1545 NA NA 1910
## 2 2013 1 2 NA 1601 NA NA 1735
## 3 2013 1 3 NA 857 NA NA 1209
## 4 2013 1 3 NA 645 NA NA 952
## 5 2013 1 4 NA 845 NA NA 1015
## 6 2013 1 4 NA 1830 NA NA 2044
## 7 2013 1 5 NA 840 NA NA 1001
## 8 2013 1 7 NA 820 NA NA 958
## 9 2013 1 8 NA 1645 NA NA 1838
## 10 2013 1 9 NA 755 NA NA 1012
## # … with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights whose tail number has no match in `planes`, counted per carrier
# with the largest counts first.
anti_join(flights, planes, by = "tailnum") %>%
  count(carrier, sort = TRUE)
## # A tibble: 10 x 2
## carrier n
## <chr> <int>
## 1 MQ 25397
## 2 AA 22558
## 3 UA 1693
## 4 9E 1044
## 5 B6 830
## 6 US 699
## 7 FL 187
## 8 DL 110
## 9 F9 50
## 10 WN 38
Flights with missing tail numbers also have missing departure and arrival times. It can be assumed that the flights without tail numbers were grounded and didn’t leave the origin airport.
# Keep only flights whose tail number appears on more than 100 rows.
# The result remains grouped by `tailnum`.
# NOTE(review): rows with a missing `tailnum` are grouped together as a single
# group, so they pass this filter as if they were one plane -- confirm that
# this is intended.
group_by(flights, tailnum) %>%
  filter(n() > 100)
## # A tibble: 229,202 x 19
## # Groups: tailnum [1,201]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # … with 229,192 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
library(fueleconomy)
# Vehicles whose make/model combination appears in `fueleconomy::common`.
semi_join(
  fueleconomy::vehicles,
  fueleconomy::common,
  by = c("make", "model")
)
## # A tibble: 14,531 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1833 Acura Integ… 1986 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 2 1834 Acura Integ… 1986 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 3 3037 Acura Integ… 1987 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 4 3038 Acura Integ… 1987 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 5 4183 Acura Integ… 1988 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 6 4184 Acura Integ… 1988 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 7 5303 Acura Integ… 1989 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 8 5304 Acura Integ… 1989 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 9 6442 Acura Integ… 1990 Subcom… Autom… Front-… 4 1.8 Regu… 24 20
## 10 6443 Acura Integ… 1990 Subcom… Manua… Front-… 4 1.8 Regu… 26 21
## # … with 14,521 more rows
# The 48 flights with the largest departure delays, each paired with the
# weather row for the same origin, year, month, day and hour.
delay_weather <- flights %>%
  inner_join(
    weather,
    by = c("origin", "year", "month", "day", "hour")
  ) %>%
  arrange(desc(dep_delay)) %>%
  slice(seq_len(48))
delay_weather
## # A tibble: 48 x 29
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # … with 38 more rows, and 21 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour.x <dttm>, temp <dbl>,
## # dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>
# Column-wise summary statistics for the most-delayed flights and their weather.
delay_weather %>% summary()
## year month day dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 12.0
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.50 1st Qu.: 693.0
## Median :2013 Median : 6.000 Median :14.00 Median : 935.5
## Mean :2013 Mean : 6.292 Mean :13.92 Mean :1220.6
## 3rd Qu.:2013 3rd Qu.: 9.000 3rd Qu.:19.00 3rd Qu.:1971.8
## Max. :2013 Max. :12.000 Max. :27.00 Max. :2346.0
##
## sched_dep_time dep_delay arr_time sched_arr_time
## Min. : 615 Min. : 545.0 Min. : 17.0 Min. : 32
## 1st Qu.:1030 1st Qu.: 635.5 1st Qu.: 791.5 1st Qu.:1296
## Median :1672 Median : 787.5 Median :1013.0 Median :1924
## Mean :1482 Mean : 777.0 Mean :1049.5 Mean :1700
## 3rd Qu.:1845 3rd Qu.: 853.0 3rd Qu.:1293.0 3rd Qu.:2060
## Max. :2100 Max. :1301.0 Max. :2308.0 Max. :2249
## NA's :1
## arr_delay carrier flight tailnum
## Min. : 538.0 Length:48 Min. : 23.0 Length:48
## 1st Qu.: 638.5 Class :character 1st Qu.: 395.8 Class :character
## Median : 780.0 Mode :character Median :1706.0 Mode :character
## Mean : 772.6 Mean :1607.5
## 3rd Qu.: 851.5 3rd Qu.:2343.0
## Max. :1272.0 Max. :4711.0
## NA's :1
## origin dest air_time distance
## Length:48 Length:48 Min. : 41.0 Min. : 184.0
## Class :character Class :character 1st Qu.:111.5 1st Qu.: 756.5
## Mode :character Mode :character Median :145.0 Median :1020.0
## Mean :185.7 Mean :1344.0
## 3rd Qu.:227.5 3rd Qu.:1707.2
## Max. :640.0 Max. :4983.0
## NA's :1
## hour minute time_hour.x temp
## Min. : 6.0 Min. : 0.00 Min. :2013-01-01 18:00:00 Min. :23.00
## 1st Qu.:10.0 1st Qu.: 3.75 1st Qu.:2013-04-09 04:45:00 1st Qu.:41.54
## Median :16.5 Median :25.00 Median :2013-06-27 18:00:00 Median :58.01
## Mean :14.6 Mean :21.50 Mean :2013-06-23 16:58:45 Mean :57.64
## 3rd Qu.:18.0 3rd Qu.:30.00 3rd Qu.:2013-09-12 14:00:00 3rd Qu.:77.27
## Max. :21.0 Max. :59.00 Max. :2013-12-19 17:00:00 Max. :89.06
##
## dewp humid wind_dir wind_speed
## Min. :-0.04 Min. : 26.89 Min. : 0.0 Min. : 0.000
## 1st Qu.:27.73 1st Qu.: 55.35 1st Qu.:137.5 1st Qu.: 8.055
## Median :52.52 Median : 73.25 Median :180.0 Median :10.357
## Mean :47.08 Mean : 70.28 Mean :191.9 Mean :12.563
## 3rd Qu.:69.31 3rd Qu.: 83.79 3rd Qu.:267.5 3rd Qu.:16.111
## Max. :73.40 Max. :100.00 Max. :360.0 Max. :33.373
##
## wind_gust precip pressure visib
## Min. :24.17 Min. :0.0000 Min. :1005 Min. : 0.120
## 1st Qu.:25.32 1st Qu.:0.0000 1st Qu.:1011 1st Qu.: 8.000
## Median :25.32 Median :0.0000 Median :1017 Median :10.000
## Mean :29.06 Mean :0.0075 Mean :1017 Mean : 8.015
## 3rd Qu.:33.37 3rd Qu.:0.0000 3rd Qu.:1021 3rd Qu.:10.000
## Max. :40.28 Max. :0.1400 Max. :1033 Max. :10.000
## NA's :40 NA's :15
## time_hour.y
## Min. :2013-01-01 18:00:00
## 1st Qu.:2013-04-09 04:45:00
## Median :2013-06-27 18:00:00
## Mean :2013-06-23 16:58:45
## 3rd Qu.:2013-09-12 14:00:00
## Max. :2013-12-19 17:00:00
##
All variables were higher on average times and days.
# Flights whose destination code has no match in `airports$faa`.
anti_join(flights, airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 615 615 0 1039 1100
## 3 2013 1 1 628 630 -2 1137 1140
## 4 2013 1 1 701 700 1 1123 1154
## 5 2013 1 1 711 715 -4 1151 1206
## 6 2013 1 1 820 820 0 1254 1310
## 7 2013 1 1 820 820 0 1249 1329
## 8 2013 1 1 840 845 -5 1311 1350
## 9 2013 1 1 909 810 59 1331 1315
## 10 2013 1 1 913 918 -5 1346 1416
## # … with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
`anti_join(flights, airports, by = c("dest" = "faa"))` returns flights that go to an airport that is not in the FAA list of destinations.
# Airports whose code never appears as a destination in `flights`.
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo…
## 2 06A Moton Field Municipal A… 32.5 -85.7 264 -6 A America/Chicago
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo…
## 6 0A9 Elizabethton Municipal … 36.4 -82.2 1593 -5 A America/New_Yo…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo…
## 8 0G7 Finger Lakes Regional A… 42.9 -76.8 492 -5 A America/New_Yo…
## 9 0P2 Shoestring Aviation Air… 39.8 -76.6 1000 -5 U America/New_Yo…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An…
## # … with 1,347 more rows
`anti_join(airports, flights, by = c("faa" = "dest"))` returns US airports that don’t have a flight in the data, meaning that there were no flights to that airport from New York in 2013.
# Distinct tail number / carrier combinations, ignoring flights that have no
# tail number.
plane_airline <- distinct(
  filter(flights, !is.na(tailnum)),
  tailnum, carrier
)

# Tail numbers that are paired with more than one carrier, ordered by tail
# number. The result stays grouped by `tailnum`.
multi_airline <- group_by(plane_airline, tailnum) %>%
  filter(n() > 1) %>%
  arrange(tailnum)
multi_airline
## # A tibble: 34 x 2
## # Groups: tailnum [17]
## carrier tailnum
## <chr> <chr>
## 1 9E N146PQ
## 2 EV N146PQ
## 3 9E N153PQ
## 4 EV N153PQ
## 5 9E N176PQ
## 6 EV N176PQ
## 7 9E N181PQ
## 8 EV N181PQ
## 9 9E N197PQ
## 10 EV N197PQ
## # … with 24 more rows
The hypothesis is rejected. There are seventeen planes that have been used by more than one carrier.