library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.3
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("nycflights13")
library("fueleconomy")
head(flights)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
head(airports)
## # A tibble: 6 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Y…
## 2 06A Moton Field Municipal Airp… 32.5 -85.7 264 -6 A America/Chica…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chica…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Y…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Y…
## 6 0A9 Elizabethton Municipal Air… 36.4 -82.2 1593 -5 A America/New_Y…
head(weather)
## # A tibble: 6 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
## # … with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point() +
xlab("Longitude") +
ylab("Latitude")+
coord_quickmap()
p1.13.4.6 <- flights %>% group_by(dest) %>%
summarize(Average_Delay = mean(arr_delay, na.rm = TRUE)) %>%
#inner vs left or right join in this case
inner_join(airports, by = c('dest' = 'faa')) %>%
ggplot(aes(x = lon, y = lat, color = Average_Delay)) +
borders('state') +
geom_point() +
labs(title = "Average Flight Delay by Destination",
subtitle = "Spatial distribution of delays" ,
caption = "Dataset: nycflights13") +
xlab("Longitude") +
ylab("Latitude") +
scale_color_gradient(low = "#ffd60a", high = "#03045e") +
coord_quickmap()
p1.13.4.6
q2.13.4.6 <- flights %>%
left_join(airports, c(origin = "faa")) %>%
#each time we left join it copies airport fields into flights, suffix to differentiate
left_join(airports, c(dest = "faa"), suffix = c('.origin', '.dest')) %>%
#left_join(airports, c(dest = "faa"))
select(dest, origin, contains('lat'), contains('lon'))
q2.13.4.6
## # A tibble: 336,776 x 6
## dest origin lat.origin lat.dest lon.origin lon.dest
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 IAH EWR 40.7 30.0 -74.2 -95.3
## 2 IAH LGA 40.8 30.0 -73.9 -95.3
## 3 MIA JFK 40.6 25.8 -73.8 -80.3
## 4 BQN JFK 40.6 NA -73.8 NA
## 5 ATL LGA 40.8 33.6 -73.9 -84.4
## 6 ORD EWR 40.7 42.0 -74.2 -87.9
## 7 FLL EWR 40.7 26.1 -74.2 -80.2
## 8 IAD LGA 40.8 38.9 -73.9 -77.5
## 9 MCO JFK 40.6 28.4 -73.8 -81.3
## 10 ORD LGA 40.8 42.0 -73.9 -87.9
## # … with 336,766 more rows
After fitting a linear regression line to the age versus delays plot where age is the explanatory variable and delays is the response variable it appears that delay seems to decrease slightly with the age of a plane.
plane_age <- planes %>%
mutate(age = 2013 - year) %>%
select(tailnum, age)
head(plane_age)
## # A tibble: 6 x 2
## tailnum age
## <chr> <dbl>
## 1 N10156 9
## 2 N102UW 15
## 3 N103US 14
## 4 N104UW 14
## 5 N10575 11
## 6 N105UW 14
p3.13.4.6 <- flights %>%
#can also do, by = "tailnum"
inner_join(plane_age, by = c("tailnum")) %>%
group_by(age) %>%
filter(!is.na(dep_delay) & !is.na(age)) %>%
#why does mean(arr_delay) give me weird data
summarise(delay = mean(dep_delay)) %>%
ggplot(aes(x = age, y = delay)) +
geom_point(color = "#001f54")+
geom_smooth(method = lm, color = "#c9a227") +
labs(title = "Flight Delay by Age of a Plane",
subtitle = "Relationship between the age of a plane and its delays" ,
caption = "Dataset: nycflights13") +
xlab("Age of the Plane (in years)") +
ylab("Delay") +
geom_line()
p3.13.4.6
## `geom_smooth()` using formula 'y ~ x'
Just looking at the individual bi-variate relationship between departure delay and each weather condition, there does not appear to be any strong correlation. While precipitation seems to make the largest difference, it is still not as prominent making it tough to use weather condition as a good predictor for flight delay.
p4.13.4.6 <- flights %>%
#why does by = "year" give a limit error?
left_join(weather, by = c('year','month','day','hour', 'origin', 'time_hour')) %>%
filter(!is.na(temp) & !is.na(dewp) & !is.na(humid) &
!is.na(precip) & !is.na(pressure) & !is.na(visib) &
!is.na(wind_dir) & !is.na(wind_gust) & !is.na(wind_speed)) %>%
gather(condition, value, c(temp, dewp, humid, precip, pressure, visib, wind_dir, wind_gust, wind_speed)) %>%
filter(!is.na(dep_delay)) %>%
ggplot(mapping = aes(x = value, y = dep_delay, color = condition)) +
geom_point() +
labs(title = "Flight Delay by Weather Condition",
subtitle = "Weather conditions that are more likely to see a delay" ,
caption = "Dataset: nycflights13") +
xlab("Weather Stats") +
ylab("Delay") +
facet_wrap(~condition, ncol = 3, scale = "free") +
scale_color_manual( name = "Weather Condition",
labels = c("Dew Point", "Humidity", "Precipitation", "Pressure", "Temperature", "Visibility", "Wind Direction", "Wind Speed"),
values = c("#edc531", "#dbb42c", "#c9a227", "#682a92", "#5d2689", "#522882", "#065a82", "#1b3b6f","#21295c" ))
p4.13.4.6
According to the spatial pattern of delays there appears to be large delays in the southeast region. Cross-referencing this information reveals that nationwide, more than 900 flights were canceled and 7,400 delayed, according to flight-tracking service FlightStats due to a line of strong storms moving in the mid-Atlantic on June 13, 2013.
p5.13.4.6 <- flights %>%
filter(year == 2013, month == 6, day == 13) %>%
group_by(dest) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c("dest" = "faa")) %>%
ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
borders("state") +
geom_point(alpha = 0.9) +
labs(title = "Spatial Pattern of Flights Delayed",
subtitle = "On June 13, 2013",
caption = "Dataset: nycflights13") +
xlab("Longitude") +
ylab("Latitude") +
coord_quickmap() +
scale_color_gradient2(low = "#ffd60a", mid = "#ffd60a", high = "#03045e")
p5.13.4.6
## Warning: Removed 3 rows containing missing values (geom_point).
Flights that have a missing tail number all have missing values in the arrival time column, meaning that the flight was canceled. For those tailnum that don’t have a matching record in plane, it seems most of them come from the same two carriers.
q1a.13.5.1 <- flights %>%
filter(is.na(tailnum), !is.na(arr_time)) %>%
nrow()
q1a.13.5.1
## [1] 0
q1b.13.5.1 <- flights %>%
anti_join(planes, by = 'tailnum') %>%
group_by(carrier) %>%
summarize(n = n()) %>%
arrange(desc(n))
q1b.13.5.1
## # A tibble: 10 x 2
## carrier n
## <chr> <int>
## 1 MQ 25397
## 2 AA 22558
## 3 UA 1693
## 4 9E 1044
## 5 B6 830
## 6 US 699
## 7 FL 187
## 8 DL 110
## 9 F9 50
## 10 WN 38
q2.13.5.1 <- flights %>%
group_by(tailnum) %>%
filter(n() > 100)
q2.13.5.1
## # A tibble: 229,202 x 19
## # Groups: tailnum [1,201]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # … with 229,192 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
q3.13.5.1 <- fueleconomy::vehicles %>%
semi_join(fueleconomy::common, by = c("make", "model"))
q3.13.5.1
## # A tibble: 14,531 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1833 Acura Integ… 1986 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 2 1834 Acura Integ… 1986 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 3 3037 Acura Integ… 1987 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 4 3038 Acura Integ… 1987 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 5 4183 Acura Integ… 1988 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 6 4184 Acura Integ… 1988 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 7 5303 Acura Integ… 1989 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 8 5304 Acura Integ… 1989 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 9 6442 Acura Integ… 1990 Subcom… Autom… Front-… 4 1.8 Regu… 24 20
## 10 6443 Acura Integ… 1990 Subcom… Manua… Front-… 4 1.8 Regu… 26 21
## # … with 14,521 more rows
Although, cross-referencing this information did not turn up any results, it appears to be that high precipitation, high pressure and high wind direction were recorded around June 30th and July 1st 2013 which may have caused flight delays within a 48 hour period in 2013.
p4a.13.5.1 <- flights %>%
ggplot(aes(x=time_hour, y=arr_delay)) +
geom_smooth(color = "#0f4c5c") +
labs(title = "Worst Delays over the course of 2013",
subtitle = "Within a 48 hour period",
caption = "Dataset: nycflights13") +
xlab("Jan 2013 - Dec 2013") +
ylab("Delay")
p4a.13.5.1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 9430 rows containing non-finite values (stat_smooth).
p4b.13.5.1 <- flights %>%
filter((month == 6 & day %in% c(29,30)) | (month == 7 & day %in% c(1,2))) %>%
ggplot(aes(x=time_hour, y=arr_delay)) +
geom_smooth(color = "#0f4c5c") +
labs(title = "Worst Delays over the course of 2013",
subtitle = "Within a 48 hour period",
caption = "Dataset: nycflights13") +
xlab("Date (from 06/30 to 07/02)") +
ylab("Delay")
p4b.13.5.1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 221 rows containing non-finite values (stat_smooth).
q4.13.5.1 <- flights %>%
filter(time_hour > "2013-06-30 07:00:00" & time_hour < "2013-07-01 13:00:00" ) %>%
left_join(weather, by = c("origin", "time_hour"))
q4.13.5.1
## # A tibble: 1,244 x 32
## year.x month.x day.x dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 6 30 12 2231 101 352
## 2 2013 6 30 21 2300 81 116
## 3 2013 6 30 23 2055 208 123
## 4 2013 6 30 25 2359 26 413
## 5 2013 6 30 43 2250 113 150
## 6 2013 6 30 56 2245 131 201
## 7 2013 6 30 116 2359 77 451
## 8 2013 6 30 153 2245 188 422
## 9 2013 6 30 217 2359 138 545
## 10 2013 6 30 754 800 -6 1143
## # … with 1,234 more rows, and 25 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour.x <dbl>, minute <dbl>,
## # time_hour <dttm>, year.y <int>, month.y <int>, day.y <int>, hour.y <int>,
## # temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>,
## # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>
p4c.13.5.1 <- q4.13.5.1 %>%
gather(key = "weather", value = "amount",
temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib) %>%
ggplot(aes(x = time_hour, y = amount, color = weather)) +
geom_smooth(na.rm = TRUE) +
labs(title = "Worst Delays over the course of 2013",
subtitle = "Within a 48 hour period",
caption = "Dataset: nycflights13") +
xlab("Time of the Day") +
ylab("Weather Stats") +
theme(axis.text.x = element_text(angle = 80, vjust = .55)) +
facet_wrap(vars(weather), scales = "free", ncol = 4) +
scale_color_manual(name = "Weather Condition",
labels = c("Dew Point", "Humidity", "Precipitation", "Pressure", "Temperature", "Visibility", "Wind Direction", "Wind Speed") ,
values = c("#b7094c", "#a01a58", "#892b64", "#723c70", "#5c4d7d", "#455e89", "#1780a1", "#0091ad"))
p4c.13.5.1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
The expression anti_join(flights, airports, by = c(“dest” = “faa”)) returns the flights that went to an airport that is not in the FAA list of destinations. The expression anti_join(airports, flights, by = c(“faa” = “dest”)) returns the US airports that were not the destination of any flight in the data.
The hypothesis that each plane is flown by a single airline should be rejected because as seen below, there were 17 planes that were flown by over 1 carrier.
q6.13.5.1 <- flights %>%
select(carrier, tailnum) %>%
group_by(tailnum) %>%
summarize(n = length(unique(carrier))) %>%
filter(n > 1)
q6.13.5.1
## # A tibble: 18 x 2
## tailnum n
## <chr> <int>
## 1 N146PQ 2
## 2 N153PQ 2
## 3 N176PQ 2
## 4 N181PQ 2
## 5 N197PQ 2
## 6 N200PQ 2
## 7 N228PQ 2
## 8 N232PQ 2
## 9 N933AT 2
## 10 N935AT 2
## 11 N977AT 2
## 12 N978AT 2
## 13 N979AT 2
## 14 N981AT 2
## 15 N989AT 2
## 16 N990AT 2
## 17 N994AT 2
## 18 <NA> 7