# Load packages
#install.packages("statsr")
library(statsr)
## Loading required package: BayesFactor
## Loading required package: coda
## Loading required package: Matrix
## ************
## Welcome to BayesFactor 0.9.12-4.3. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
##
## Type BFManual() to open the manual.
## ************
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the `nycflights` sample from statsr into the workspace, then print a
# compact overview of its columns and their types.
data("nycflights", package = "statsr")
str(nycflights)
## tibble [32,735 x 16] (S3: tbl_df/data.frame)
## $ year : int [1:32735] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:32735] 6 5 12 5 7 1 12 8 9 4 ...
## $ day : int [1:32735] 30 7 8 14 21 1 9 13 26 30 ...
## $ dep_time : int [1:32735] 940 1657 859 1841 1102 1817 1259 1920 725 1323 ...
## $ dep_delay: num [1:32735] 15 -3 -1 -4 -3 -3 14 85 -10 62 ...
## $ arr_time : int [1:32735] 1216 2104 1238 2122 1230 2008 1617 2032 1027 1549 ...
## $ arr_delay: num [1:32735] -4 10 11 -34 -8 3 22 71 -8 60 ...
## $ carrier : chr [1:32735] "VX" "DL" "DL" "DL" ...
## $ tailnum : chr [1:32735] "N626VA" "N3760C" "N712TW" "N914DL" ...
## $ flight : int [1:32735] 407 329 422 2391 3652 353 1428 1407 2279 4162 ...
## $ origin : chr [1:32735] "JFK" "JFK" "JFK" "JFK" ...
## $ dest : chr [1:32735] "LAX" "SJU" "LAX" "TPA" ...
## $ air_time : num [1:32735] 313 216 376 135 50 138 240 48 148 110 ...
## $ distance : num [1:32735] 2475 1598 2475 1005 296 ...
## $ hour : num [1:32735] 9 16 8 18 11 18 12 19 7 13 ...
## $ minute : num [1:32735] 40 57 59 41 2 17 59 20 25 23 ...
Analysis
Departure delays in flights to Raleigh-Durham (RDU)
# Histogram of departure delays across every flight in the sample.
# Default binning is used, so ggplot2 will suggest picking a `binwidth`.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Narrow the data to flights bound for RDU (dest == "RDU"), then draw a
# histogram of departure delays for just those flights.
rdu_flights <- filter(nycflights, dest == "RDU")
rdu_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Numerical summaries for the RDU flights: mean and standard deviation of the
# departure delay, plus the number of flights.
summarise(
  rdu_flights,
  mean_dd = mean(dep_delay),
  sd_dd = sd(dep_delay),
  n = n()
)
## # A tibble: 1 x 3
## mean_dd sd_dd n
## <dbl> <dbl> <int>
## 1 11.7 35.6 801
On time departure rate for NYC airports
In order to determine which airport has the best on time departure rate, we need to
• first classify each flight as “on time” or “delayed” (a flight counts as “on time” if its departure is delayed by less than 5 minutes; otherwise it is “delayed”),
• then group flights by origin airport (three airports in NYC),
• then calculate on time departure rates for each origin airport,
• and finally arrange the airports in descending order for on time departure percentage.
This analysis starts with classifying each flight as “on time” or “delayed” by creating a new variable with the mutate function.
Then, within each origin group, count how many flights are on time - sum(dep_type == "on time") - and divide that count by the total number of flights in the group - n() - to get a proportion, which is stored in a new variable called ot_dep_rate.
Finally, visualize the distribution of on time departure rate across the three airports using a segmented bar plot.
# Tag every flight as "on time" (departure delay under 5 minutes) or "delayed".
nycflights <- mutate(
  nycflights,
  dep_type = if_else(dep_delay < 5, "on time", "delayed")
)

# On-time departure rate per origin airport, best airport first.
nycflights %>%
  group_by(origin) %>%
  summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>%
  arrange(desc(ot_dep_rate))
## # A tibble: 3 x 2
## origin ot_dep_rate
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# Build both delay indicators in a single mutate so the 5-minute threshold is
# written only once: `ontime` is the logical flag and `dep_type` is its text
# label ("on time" / "delayed").
nycflights <- nycflights %>%
  mutate(
    ontime = dep_delay < 5,
    dep_type = ifelse(ontime, "on time", "delayed")
  )

# Share of on-time departures per origin airport, highest first. `ontime` is
# already logical, so it is summed directly instead of being compared to TRUE.
nycflights %>%
  group_by(origin) %>%
  summarise(ontime_prop = sum(ontime) / n()) %>%
  arrange(desc(ontime_prop))
## # A tibble: 3 x 2
## origin ontime_prop
## <chr> <dbl>
## 1 LGA 0.728
## 2 JFK 0.694
## 3 EWR 0.637
# Segmented bar plot: one bar per origin airport, split by departure type.
# Minimal variant without a title or axis labels:
#   ggplot(data = nycflights, aes(x = origin, fill = dep_type)) + geom_bar()
nycflights %>%
  ggplot(aes(x = origin, fill = dep_type)) +
  geom_bar() +
  labs(x = "Origin Depart", y = "Count of Flights") +
  ggtitle("The distribution of on time and delay departure rate across the NYC three airports")

Simplified analysis and visualization of airline arrival and departure delays in NYC
To keep things simple, we pick only three airline companies - AA, DL and UA - and visualize their flight arrival and departure delays in NYC.
# Keep only the flights operated by carriers AA, DL and UA.
dl_aa_ua <- filter(nycflights, carrier %in% c("AA", "DL", "UA"))

# Scatter plot of arrival delay against departure delay, coloured by carrier.
# xlim() limits the x-axis to [-25, 100]; rows outside that range are removed
# from the plot with a warning.
# Minimal variant without labels or a title:
#   ggplot(dl_aa_ua, aes(x = dep_delay, y = arr_delay, color = carrier)) +
#     xlim(-25, 100) +
#     geom_point()
dl_aa_ua %>%
  ggplot(aes(x = dep_delay, y = arr_delay, color = carrier)) +
  geom_point() +
  labs(x = "Depart delay (min)", y = "Arrive delay (min)") +
  ggtitle("NYC Flight Arrive Delay \nAnd NYC Flight Depart Delay") +
  xlim(-25, 100)
## Warning: Removed 444 rows containing missing values (geom_point).
