#What time of day should you fly if you want to avoid delays as much as possible? Does this choice depend on anything? Season? Weather? Airport? Airline? Find three patterns (“null results” are ok!).
#Write your results into Rmarkdown.
#Include a short introduction that summarizes the three results. Then, have a section for each finding.
#Support each finding with data summaries and visualizations. Include your code when necessary.
# Attach dplyr with library(): unlike require(), it stops with an error when
# the package is missing instead of returning FALSE and failing later on.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.4 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(nycflights13)
# Drop rows with a missing departure or arrival delay (treated here as flights
# that did not complete, hence the name).
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Mean arrival delay and number of late flights per origin airport, counting
# only flights that arrived late (arr_delay > 0).
# NOTE(review): despite its name, `airline` is keyed by origin, not by carrier.
# The earlier arrange(desc(arr_delay)) step was removed: sorting rows before
# summarise() has no effect on mean() or n(), so it was only wasted work.
airline <- not_cancelled %>%
  filter(arr_delay > 0) %>%
  group_by(origin) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE), n = n())
# Late arrivals only. Rows whose arr_delay is NA are dropped too, because
# filter() keeps only rows where the condition is TRUE.
flights1 <- filter(flights, arr_delay > 0)
#ggplot(flights1, aes(x = arr_delay, fill = carrier, colour = carrier)) +
# geom_histogram(alpha = 0.5)
# Pattern 1: distribution of arrival delays (late flights only) per carrier,
# with carriers ordered by their mean delay (reorder()'s default summary).
# labs() names aesthetics, not screen positions: x is still the carrier and y
# the delay after coord_flip(), so the two labels were previously swapped.
# The `fill`/`colour` arguments used to sit outside aes() and therefore never
# became aesthetic mappings; they are removed.
ggplot(flights1, aes(x = reorder(carrier, arr_delay), y = arr_delay)) +
  geom_boxplot() +
  ggtitle("Pattern 1: Average Delay by Airline") +
  coord_flip(ylim = c(0, 80)) +
  labs(x = "Carrier", y = "Average Delay(min)")

# First pattern: OO, YV, and 9E are the airlines most likely to be delayed compared with the other airlines.
# Pattern 2: distribution of arrival delays (late flights only) per origin
# airport, with airports ordered by their mean delay.
# labs() names aesthetics, not screen positions: x is still the origin and y
# the delay after coord_flip(), so the two labels were previously swapped.
# The `fill`/`colour` arguments used to sit outside aes() and therefore never
# became aesthetic mappings; they are removed.
ggplot(flights1, aes(x = reorder(origin, arr_delay), y = arr_delay)) +
  geom_boxplot() +
  ggtitle("Pattern 2: Average Delay by airports") +
  coord_flip(ylim = c(0, 80)) +
  labs(x = "Origin Airports", y = "Average Delay(min)")

# Second pattern: EWR has the most delay, followed by JFK and then LGA.
# Pattern 3: mean arrival delay of late flights for each hour of the day,
# where the hour is taken from `time_hour`.
airline1 <- flights %>%
  filter(arr_delay > 0) %>%
  mutate(hour = hour(time_hour)) %>%
  group_by(hour) %>%
  summarize(arr_delay = mean(arr_delay, na.rm = TRUE))

# airline1 holds a single mean per hour, so a boxplot would collapse every box
# to one line; bars show the per-hour mean directly. Hours are ordered by
# their mean delay, as before. The `fill`/`colour` arguments that sat outside
# aes() never became mappings and are removed.
ggplot(airline1, aes(x = reorder(hour, arr_delay), y = arr_delay)) +
  geom_col() +
  ggtitle("Pattern 3: Average Delay by time of a day") +
  coord_flip(ylim = c(0, 60)) +
  labs(y = "Average Delay(min)", x = "hour in a day")

# Third pattern: delays in the morning departure hours are smaller than in the afternoon and evening hours.
# Daily wind summary per origin airport: minimum and mean of the wind_speed
# readings for each (date, origin) pair.
# Note: if a group has no non-missing wind_speed, min(..., na.rm = TRUE)
# returns Inf (with a warning) rather than NA.
# `.groups = "drop_last"` keeps the result grouped by `date`, exactly as the
# default did, while making that choice explicit and silencing the message.
weather2 <- weather %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date, origin) %>%
  summarise(
    min_wnd = min(wind_speed, na.rm = TRUE),
    mean_wnd = mean(wind_speed, na.rm = TRUE),
    .groups = "drop_last"
  )
# Pattern 4: daily mean arrival delay of late flights versus the daily minimum
# wind speed, one panel per origin airport, with a linear trend line.
# The axis labels were previously swapped: x is min_wnd (wind speed) and y is
# arr_delay (average delay). The join keys are now spelled out; they are the
# same two columns dplyr used to pick implicitly.
flights %>%
  filter(arr_delay > 0) %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date, origin) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE), .groups = "drop") %>%
  left_join(weather2, by = c("date", "origin")) %>%
  ggplot(aes(x = min_wnd, y = arr_delay)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  facet_wrap(~origin) +
  labs(
    x = "wind speed",
    y = "average delay",
    title = "Average delay relation with wind speed"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

# Pattern 4: wind speed appears to be related to average delay at EWR, but no increasing relationship was found between wind speed and delay at JFK or LGA.
# Wind speed at LGA appears to be the least related, while LGA has the least delay, so the conclusion is that the relationship between wind speed and average delay is null.
# Conclusion from the three patterns: choosing US, HA, or AS as the carrier, LGA as the departure airport, and morning hours as the departure time can largely avoid delays.
# Mean arrival delay of late flights for the recommended carriers in the early
# morning hours (5-8) out of EWR, sorted from worst to best.
# Set membership must use %in%: `carrier == c("US", "HA", "AS")` recycles the
# vector element-wise against the rows (row 1 vs "US", row 2 vs "HA", ...), so
# it silently dropped most matching flights and raised the "longer object
# length" warning. The same applied to the hour comparison.
# NOTE(review): the written conclusion recommends LGA as the departure
# airport, but this filter uses EWR -- confirm which one is intended.
# `.groups = "drop_last"` keeps the result grouped by carrier and origin, as
# the default did, and silences the summarise() message.
airline3 <- flights %>%
  filter(
    carrier %in% c("US", "HA", "AS"),
    hour(time_hour) %in% c(5, 6, 7, 8),
    origin == "EWR",
    arr_delay > 0
  ) %>%
  group_by(carrier, origin, hour) %>%
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  arrange(desc(arr_delay))
airline3
## NOTE: the output below was captured before the `%in%` fix, when the recycled
## `==` comparisons dropped matching rows; re-run to refresh it.
## # A tibble: 4 x 4
## # Groups: carrier, origin [2]
## carrier origin hour arr_delay
## <chr> <chr> <dbl> <dbl>
## 1 US EWR 8 16
## 2 US EWR 6 15.8
## 3 US EWR 5 13.3
## 4 AS EWR 7 12.5