knitr::opts_chunk$set(echo = TRUE)

Load packages and data

# Load packages
library(tidyverse) 
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load data
d <- read_csv("flight_delay_clean.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   airline = col_character(),
##   departure_date = col_date(format = ""),
##   origin = col_character(),
##   destination = col_character(),
##   route_code = col_character(),
##   scheduled_departure = col_time(format = ""),
##   scheduled_arrival = col_time(format = ""),
##   actual_arrival = col_time(format = ""),
##   scheduled_flight_length = col_double(),
##   actual_flight_length = col_double(),
##   delay = col_double(),
##   delay_indicator = col_double(),
##   day_of_week = col_double()
## )
# Inspect data
glimpse(d)
## Rows: 360
## Columns: 13
## $ airline                 <chr> "RegionEx", "RegionEx", "RegionEx", "Region...
## $ departure_date          <date> 2008-09-01, 2008-09-01, 2008-09-01, 2008-0...
## $ origin                  <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "...
## $ destination             <chr> "MSY", "MSY", "MSY", "MSY", "MSY", "MSY", "...
## $ route_code              <chr> "DFW/MSY", "DFW/MSY", "DFW/MSY", "DFW/MSY",...
## $ scheduled_departure     <time> 09:10:00, 13:10:00, 18:10:00, 09:10:00, 13...
## $ scheduled_arrival       <time> 10:40:00, 14:40:00, 19:40:00, 10:40:00, 14...
## $ actual_arrival          <time> 11:00:00, 15:00:00, 19:58:00, 10:50:00, 14...
## $ scheduled_flight_length <dbl> 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90,...
## $ actual_flight_length    <dbl> 110, 110, 108, 100, 101, 100, 99, 99, 99, 1...
## $ delay                   <dbl> 20, 20, 18, 10, 11, 10, 9, 9, 9, 10, 10, 9,...
## $ delay_indicator         <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ day_of_week             <dbl> 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6...
summary(d)
##    airline          departure_date          origin          destination       
##  Length:360         Min.   :2008-09-01   Length:360         Length:360        
##  Class :character   1st Qu.:2008-09-08   Class :character   Class :character  
##  Mode  :character   Median :2008-09-15   Mode  :character   Mode  :character  
##                     Mean   :2008-09-15                                        
##                     3rd Qu.:2008-09-23                                        
##                     Max.   :2008-09-30                                        
##   route_code        scheduled_departure scheduled_arrival actual_arrival   
##  Length:360         Length:360          Length:360        Length:360       
##  Class :character   Class1:hms          Class1:hms        Class1:hms       
##  Mode  :character   Class2:difftime     Class2:difftime   Class2:difftime  
##                     Mode  :numeric      Mode  :numeric    Mode  :numeric   
##                                                                            
##                                                                            
##  scheduled_flight_length actual_flight_length     delay        delay_indicator 
##  Min.   : 70.00          Min.   : 59.00       Min.   :-13.00   Min.   :0.0000  
##  1st Qu.: 75.00          1st Qu.: 86.00       1st Qu.:  5.00   1st Qu.:0.0000  
##  Median : 90.00          Median : 97.00       Median : 11.00   Median :0.0000  
##  Mean   : 85.83          Mean   : 99.91       Mean   : 14.07   Mean   :0.2611  
##  3rd Qu.: 90.00          3rd Qu.:108.25       3rd Qu.: 15.00   3rd Qu.:1.0000  
##  Max.   :100.00          Max.   :243.00       Max.   :153.00   Max.   :1.0000  
##   day_of_week 
##  Min.   :1.0  
##  1st Qu.:2.0  
##  Median :4.0  
##  Mean   :3.9  
##  3rd Qu.:6.0  
##  Max.   :7.0

The dataset has been cleaned for you and is ready for analysis.

Questions

Lightly comment your code and use pipes for readability.

Comment briefly on each of the questions, as directed. Only the the final question requires a lengthier response.

Q1

Compute the mean, median, 90th percentile, and standard deviation of arrival delay minutes for RegionEx flights. Do the same for MDA flights. Which measure of central tendency would be most appropriate for comparing airline performance?

I think you need to look at the difference between the Median and where the Mean and 90th percentile are compared to the Median. Looking at MDA, their delay times are much better overall based on their mean and 90th percentile. RegionEx appears to less consistent on their delay times and without graphing it, it would be difficult to know if the spread has outliers. They have a high SD and their Median is consistent with their 50th percentile so I would want to use the Median.

mean(d$delay[d$airline=="RegionEx"])
## [1] 15.6625
median(d$delay[d$airline=="RegionEx"])
## [1] 9
quantile(d$delay[d$airline=="RegionEx"], probs = seq(.5, .9, by = .1))
## 50% 60% 70% 80% 90% 
##   9  11  13  17  21
sd(d$delay[d$airline=="RegionEx"])
## [1] 27.65036
mean(d$delay[d$airline=="MDA"],)
## [1] 10.9
median(d$delay[d$airline=="MDA"])
## [1] 13
quantile(d$delay[d$airline=="MDA"], probs = seq(.5, .9, by = .1))
##  50%  60%  70%  80%  90% 
## 13.0 13.0 14.0 15.0 16.1
sd(d$delay[d$airline=="MDA"])
## [1] 6.338359

Q2

Inspect the distribution of RegionEx’s arrival delays by constructing a histogram of the number of arrival delay minutes of RegionEx’s flights. Do the same for MDA’s flights. How do these two distributions compare? Hint: use facet_wrap().

Looking at the Histograms, RegionEx has more delays, but for less duration, with a much higher amount of outliers.

##p1<-hist(d$delay[d$airline=="RegionEx"])

##p2<-hist(d$delay[d$airline=="MDA"])

ggplot(d, aes(d$delay)) +
  geom_histogram() +
  facet_wrap(vars(d$airline), labeller = "label_both")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Q3

So far we have considered airline performance in terms of average delay in minutes. However, the performance metrics, as noted in the case description, also include the percentage of delayed flights. Let’s verify that MDA’s COO is correct: does RegionEx have a higher percentage of delayed flights? Yes, based on the below, RegionEx has a higher Percentage of delayed flights.

Here is code to answer that question:

d %>% 
  group_by(d$airline) %>% 
  summarize(n = n(),
            percent_delay = (mean(delay_indicator) * 100) %>% round(1)) 
## # A tibble: 2 x 3
##   `d$airline`     n percent_delay
## * <chr>       <int>         <dbl>
## 1 MDA           120          25.8
## 2 RegionEx      240          26.2

Note that because delay_indicator is numeric (a binary 0/1 variable) calculating the mean of the vector returns the proportion of 1s, which, multiplied by 100, is equivalent to the percentage of delayed flights.

Write your own code to create a table summarizing the percentage of delayed flights by airline and route.

d %>% 
  group_by(d$airline, d$route_code) %>% 
  summarize(n = n(),
            percent_delay = (mean(delay_indicator) * 100) %>% round(1))
## `summarise()` has grouped output by 'd$airline'. You can override using the `.groups` argument.
## # A tibble: 8 x 4
## # Groups:   d$airline [2]
##   `d$airline` `d$route_code`     n percent_delay
##   <chr>       <chr>          <int>         <dbl>
## 1 MDA         DFW/MSY           30          26.7
## 2 MDA         MSY/DFW           30          30  
## 3 MDA         MSY/PNS           30          20  
## 4 MDA         PNS/MSY           30          26.7
## 5 RegionEx    DFW/MSY           90          25.6
## 6 RegionEx    MSY/DFW           90          28.9
## 7 RegionEx    MSY/PNS           30          20  
## 8 RegionEx    PNS/MSY           30          26.7

These tables contain conflicting information. How should you answer the question of whether RegionEx has a higher percentage of delayed flights? Is the the COO correct? And, if not, why not? When you start drilling down in data in a grandular level percentages start shifting. Weights of values change

Q4

Compare the scheduled flight durations for the two airlines on each of their four routes. Compare the actual flight durations. What do you notice? If the two airlines had the same scheduled duration, what impact would this have on their delay records? The flight durations seem to be the same for each airline for each route.

d %>% 
  group_by(d$airline,d$route_code)  %>% 
  summarize(n = n(),mean(d$scheduled_flight_length),mean(d$actual_flight_length))
## `summarise()` has grouped output by 'd$airline'. You can override using the `.groups` argument.
## # A tibble: 8 x 5
## # Groups:   d$airline [2]
##   `d$airline` `d$route_code`     n `mean(d$scheduled_flig~ `mean(d$actual_fligh~
##   <chr>       <chr>          <int>                   <dbl>                 <dbl>
## 1 MDA         DFW/MSY           30                    85.8                  99.9
## 2 MDA         MSY/DFW           30                    85.8                  99.9
## 3 MDA         MSY/PNS           30                    85.8                  99.9
## 4 MDA         PNS/MSY           30                    85.8                  99.9
## 5 RegionEx    DFW/MSY           90                    85.8                  99.9
## 6 RegionEx    MSY/DFW           90                    85.8                  99.9
## 7 RegionEx    MSY/PNS           30                    85.8                  99.9
## 8 RegionEx    PNS/MSY           30                    85.8                  99.9

Q5

Does the data support the claim that the on‐time performance of RegionEx is worse than that of MDA? Write a paragraph in which you argue a position. In your answer, please incorporate the quantitative evidence you have developed above.

Based on the the data above, RegionEx’s delays tend to be less frequent and of lower duration. They have some outliers of higher duration that throw their averages off but overall their on-time performance is better than MDA.