# Load packages
#install.packages("statsr")
library(statsr)
## Loading required package: BayesFactor
## Loading required package: coda
## Loading required package: Matrix
## ************
## Welcome to BayesFactor 0.9.12-4.3. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the nycflights sample data set and inspect its column types.
data("nycflights", package = "statsr")
str(nycflights)
## tibble [32,735 x 16] (S3: tbl_df/data.frame)
##  $ year     : int [1:32735] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month    : int [1:32735] 6 5 12 5 7 1 12 8 9 4 ...
##  $ day      : int [1:32735] 30 7 8 14 21 1 9 13 26 30 ...
##  $ dep_time : int [1:32735] 940 1657 859 1841 1102 1817 1259 1920 725 1323 ...
##  $ dep_delay: num [1:32735] 15 -3 -1 -4 -3 -3 14 85 -10 62 ...
##  $ arr_time : int [1:32735] 1216 2104 1238 2122 1230 2008 1617 2032 1027 1549 ...
##  $ arr_delay: num [1:32735] -4 10 11 -34 -8 3 22 71 -8 60 ...
##  $ carrier  : chr [1:32735] "VX" "DL" "DL" "DL" ...
##  $ tailnum  : chr [1:32735] "N626VA" "N3760C" "N712TW" "N914DL" ...
##  $ flight   : int [1:32735] 407 329 422 2391 3652 353 1428 1407 2279 4162 ...
##  $ origin   : chr [1:32735] "JFK" "JFK" "JFK" "JFK" ...
##  $ dest     : chr [1:32735] "LAX" "SJU" "LAX" "TPA" ...
##  $ air_time : num [1:32735] 313 216 376 135 50 138 240 48 148 110 ...
##  $ distance : num [1:32735] 2475 1598 2475 1005 296 ...
##  $ hour     : num [1:32735] 9 16 8 18 11 18 12 19 7 13 ...
##  $ minute   : num [1:32735] 40 57 59 41 2 17 59 20 25 23 ...

Analysis

Departure delays in flights to Raleigh-Durham (RDU)

# Histogram of departure delays across every flight in the sample.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Restrict the data to flights headed to RDU (dest == "RDU"), then draw a
# histogram of the departure delays of only those flights.
rdu_flights <- filter(nycflights, dest == "RDU")

rdu_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Numerical summaries for the RDU flights: mean and standard deviation of the
# departure delay, plus the number of flights.
summarise(
  rdu_flights,
  mean_dd = mean(dep_delay),
  sd_dd = sd(dep_delay),
  n = n()
)
## # A tibble: 1 x 3
##   mean_dd sd_dd     n
##     <dbl> <dbl> <int>
## 1    11.7  35.6   801

On time departure rate for NYC airports

In order to determine which airport has the best on time departure rate, we need to

• first classify each flight as “on time” or “delayed” (here a flight counts as “on time” if it departs less than 5 minutes late, and as “delayed” otherwise),

• then group flights by origin airport (three airports in NYC),

• then calculate on time departure rates for each origin airport,

• and finally arrange the airports in descending order for on time departure percentage.

This analysis starts with classifying each flight as “on time” or “delayed” by creating a new variable with the mutate function.

Then, within each origin group, count how many flights are on time - sum(dep_type == “on time”) - and divide that count by the total number of flights in the group - n() - to get a proportion, which is stored in a new variable called ot_dep_rate.

Finally, visualize the distribution of on time departure rate across the three airports using a segmented bar plot.

# Label each flight "on time" when its departure delay is under 5 minutes,
# and "delayed" otherwise.
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)

# On-time departure rate per origin airport, highest rate first.
by_origin <- group_by(nycflights, origin)
by_origin %>%
  summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>%
  arrange(desc(ot_dep_rate))
## # A tibble: 3 x 2
##   origin ot_dep_rate
##   <chr>        <dbl>
## 1 LGA          0.728
## 2 JFK          0.694
## 3 EWR          0.637
# Alternative encoding of the same classification: a logical flag that is
# TRUE when the flight left less than 5 minutes late. The character label
# dep_type was already added to nycflights earlier in the script, so it is
# not recomputed here.
nycflights <- nycflights %>%
  mutate(ontime = dep_delay < 5)

# The mean of a logical vector is the proportion of TRUE values, so no
# explicit `== TRUE` comparison or division by n() is needed.
nycflights %>%
  group_by(origin) %>%
  summarise(ontime_prop = mean(ontime)) %>%
  arrange(desc(ontime_prop))
## # A tibble: 3 x 2
##   origin ontime_prop
##   <chr>        <dbl>
## 1 LGA          0.728
## 2 JFK          0.694
## 3 EWR          0.637
#ggplot(data = nycflights, aes(x = origin, fill = dep_type)) + geom_bar()

# Segmented bar plot: one bar per origin airport, split by departure type.
nycflights %>%
  ggplot(aes(x = origin, fill = dep_type)) +
  geom_bar() +
  labs(
    title = "The distribution of on time and delay departure rate across the NYC three airports",
    x = "Origin Depart",
    y = "Count of Flights"
  )

Simplified analysis and visualization of airline arrival and departure delays in NYC

To keep the analysis simple, we pick only three airline companies - AA, DL and UA - and visualize their arrival delays against their departure delays in NYC.

# Keep only the flights whose carrier code is AA, DL, or UA.
dl_aa_ua <- filter(nycflights, carrier %in% c("AA", "DL", "UA"))

#ggplot(dl_aa_ua, aes(x = dep_delay, y = arr_delay, color = carrier)) +
#        xlim(-25, 100) +
#        geom_point()

# Scatter plot of arrival delay against departure delay, colored by carrier.
# xlim() restricts the x axis to [-25, 100]; points outside that range are
# dropped, which is what triggers ggplot2's "Removed ... rows" warning.
dl_aa_ua %>%
  ggplot(aes(x = dep_delay, y = arr_delay, color = carrier)) +
  geom_point() +
  xlim(-25, 100) +
  labs(
    title = "NYC Flight Arrive Delay \nAnd NYC Flight Depart Delay",
    x = "Depart delay (min)",
    y = "Arrive delay (min)"
  )
## Warning: Removed 444 rows containing missing values (geom_point).