#What time of day should you fly if you want to avoid delays as much as possible? Does this choice depend on anything? #Season? Weather? Airport? Airline? Find three patterns (“null results” are ok!).
#Write your results into Rmarkdown.
#Include a short introduction that summarizes the three results. Then, have a section for each finding.
#Support each finding with data summaries and visualizations. Include your code when necessary.


require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate) 
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.4     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()
library(nycflights13)

# Rows of `flights` that have both a departure delay and an arrival delay
# recorded; rows missing either value are dropped (this script treats those
# as cancelled flights).
not_cancelled <- filter(
  flights,
  !(is.na(dep_delay) | is.na(arr_delay))
)

# Mean arrival delay and number of late arrivals per origin airport,
# restricted to flights that arrived late (arr_delay > 0).
# NOTE(review): despite the name `airline`, this table is grouped by origin
# airport, not by carrier.
airline <- not_cancelled %>%
  filter(arr_delay > 0) %>%
  group_by(origin) %>%
  # The former arrange(desc(arr_delay)) step was removed: summarise() collapses
  # each group to a single row, so the incoming row order never reached the
  # result and the sort was wasted work.
  summarise(delay = mean(arr_delay, na.rm = TRUE), n = n())

# Subset of `flights` with a positive arrival delay. filter() also drops rows
# whose arr_delay is NA, because the comparison evaluates to NA for them.
flights1 <- filter(flights, arr_delay > 0)

#ggplot(flights1, aes(x = arr_delay, fill = carrier, colour = carrier)) + 
#  geom_histogram(alpha = 0.5)

# Pattern 1: box plot of arrival delay per carrier, late arrivals only.
# Carriers are ordered by their mean arrival delay (reorder()'s default
# summary function is mean).
# The `fill = carrier, colour = carrier` arguments that used to follow aes()
# were outside the mapping and had no effect on the plot, so they are dropped.
ggplot(flights1, aes(x = reorder(carrier, arr_delay), y = arr_delay)) +
  geom_boxplot() +
  ggtitle("Pattern 1: Average Delay by Airline") +
  # Flip so carriers run down the vertical axis; ylim zooms the delay axis to
  # 0-80 so the boxes stay readable.
  coord_flip(ylim = c(0, 80)) +
  # labs() is keyed by aesthetic, not by screen position: x is still the
  # carrier after coord_flip(). The two labels were previously swapped.
  labs(x = "Carrier", y = "Average Delay(min)")

# First pattern: OO, YV, 9E is the airline more likely to delay compared with other airlines. 

# Pattern 2: box plot of arrival delay per origin airport, late arrivals only.
# Airports are ordered by their mean arrival delay (reorder()'s default
# summary function is mean).
# The `fill = origin, colour = origin` arguments that used to follow aes()
# were outside the mapping and had no effect on the plot, so they are dropped.
ggplot(flights1, aes(x = reorder(origin, arr_delay), y = arr_delay)) +
  geom_boxplot() +
  ggtitle("Pattern 2: Average Delay by airports") +
  # Flip so airports run down the vertical axis; ylim zooms the delay axis to
  # 0-80 so the boxes stay readable.
  coord_flip(ylim = c(0, 80)) +
  # labs() is keyed by aesthetic, not by screen position: x is still the
  # origin airport after coord_flip(). The two labels were previously swapped.
  labs(x = "Origin Airports", y = "Average Delay(min)")

# Second pattern: EWR have most delay compared with JFK, follow by LGA. 


# Mean arrival delay per scheduled hour of day, late arrivals only.
# `hour` is derived from the hour component of `time_hour`.
airline1 <- flights %>%
  filter(arr_delay > 0) %>%
  mutate(hour = hour(time_hour)) %>%
  group_by(hour) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE))

# Pattern 3: one bar per hour, ordered by mean delay.
# `airline1` already holds a single mean per hour, so a box plot would
# degenerate to one line per group; bars show the same values legibly.
# The `fill = hour, colour = hour` arguments that used to follow aes() were
# outside the mapping and had no effect on the plot, so they are dropped.
ggplot(airline1, aes(x = reorder(hour, arr_delay), y = arr_delay)) +
  geom_col() +
  ggtitle("Pattern 3: Average Delay by time of a day") +
  # Hours run down the vertical axis; the delay axis is zoomed to 0-60.
  coord_flip(ylim = c(0, 60)) +
  labs(y = "Average Delay(min)", x = "hour in a day")

# Pattern 3 in the morning departure hours and, delay is less than afternoon and evening hours. 
# Daily wind summary per origin airport: the minimum and the mean of
# `wind_speed` over all weather observations for that date and airport.
# The result stays grouped by `date` (summarise() peels off only the last
# grouping level).
# NOTE(review): if every wind_speed for a date/airport is NA, min() yields Inf
# and mean() yields NaN for that row; confirm whether such rows exist.
weather2 <-
  weather %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date, origin) %>%
  summarise(
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    min_wnd = min(wind_speed, na.rm = TRUE),
    mean_wnd = mean(wind_speed, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Pattern 4: daily mean arrival delay (late arrivals only) against the daily
# minimum wind speed, one panel per origin airport, with a linear fit.
flights %>%
  filter(arr_delay > 0) %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date, origin) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  # Natural join: the shared columns are `date` and `origin`.
  left_join(weather2) %>%
  ggplot(aes(x = min_wnd, y = arr_delay)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  facet_wrap(~origin) +
  # x is the wind speed and y is the delay; the two labels were previously
  # swapped.
  labs(
    x = "wind speed",
    y = "average delay",
    title = "Average delay relation with wind speed"
  )
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
## Joining, by = c("date", "origin")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

# pattern 4, wind speed is related to average delay on and EWR, but not incremental relation found on wind speed relation in JFK and LGA,
# wind speed in LGA appeared to be the least related, while LGA has the least delay, so, the conclusion is wind speed realation to average delay is null. 
# conclude 3 patterns, Choose US, HA, AS as Carrier, LGA as departure airport, and morning hours as departure time can largely avoid delay. 

# Spot check of the recommendation: mean arrival delay (late arrivals only)
# for carriers US/HA/AS on flights from EWR scheduled in hours 5-8, sorted
# from the largest to the smallest mean delay.
# NOTE(review): the written conclusion recommends LGA as the departure
# airport, but this filter keeps EWR; confirm which airport was intended.
airline3 <- flights %>%
  filter(
    # %in% tests set membership. The previous `==` against a vector recycled
    # the right-hand side element by element, which silently discarded most
    # matching rows and raised a "longer object length" warning.
    carrier %in% c("US", "HA", "AS"),
    hour(time_hour) %in% c(5, 6, 7, 8),
    origin == "EWR",
    arr_delay > 0
  ) %>%
  group_by(carrier, origin, hour) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(arr_delay))
## `summarise()` has grouped output by 'carrier', 'origin'. You can override using the `.groups` argument.
# NOTE(review): the pasted table that follows was produced by the old `==`
# filter; re-run the script to refresh it.
airline3
## # A tibble: 4 x 4
## # Groups:   carrier, origin [2]
##   carrier origin  hour arr_delay
##   <chr>   <chr>  <dbl>     <dbl>
## 1 US      EWR        8      16  
## 2 US      EWR        6      15.8
## 3 US      EWR        5      13.3
## 4 AS      EWR        7      12.5