Load libraries for tidyr and dplyr

library("tidyr")
library(dplyr)
library(ggplot2)

Read info from csv into R

filename <- c("https://raw.githubusercontent.com/nobieyi00/CUNY_MSDA_R/master/flight_dataset.csv")

flight_dataset_df <- read.csv(filename,stringsAsFactors = FALSE)
flight_dataset_df
##   Airline Arrival_Status Los.Angeles Phoenix San.Diego San.Franciso
## 1  ALASKA        On time         497     221       212          503
## 2  ALASKA        Delayed          62      12        20          102
## 3  AMWEST        On time         694    4840       383          320
## 4  AMWEST        Delayed         117     415        65          129
##   Seattle
## 1    1841
## 2     305
## 3     201
## 4      61

Tidy data

Narrow down the table

Flight_tidy <- tbl_df(flight_dataset_df)

#Narrow down the table structure

flight_freq <- Flight_tidy %>% 
  gather (State,frequency,-Airline,-Arrival_Status)

flight_freq
## # A tibble: 20 × 4
##    Airline Arrival_Status        State frequency
##      <chr>          <chr>        <chr>     <int>
## 1   ALASKA        On time  Los.Angeles       497
## 2   ALASKA        Delayed  Los.Angeles        62
## 3   AMWEST        On time  Los.Angeles       694
## 4   AMWEST        Delayed  Los.Angeles       117
## 5   ALASKA        On time      Phoenix       221
## 6   ALASKA        Delayed      Phoenix        12
## 7   AMWEST        On time      Phoenix      4840
## 8   AMWEST        Delayed      Phoenix       415
## 9   ALASKA        On time    San.Diego       212
## 10  ALASKA        Delayed    San.Diego        20
## 11  AMWEST        On time    San.Diego       383
## 12  AMWEST        Delayed    San.Diego        65
## 13  ALASKA        On time San.Franciso       503
## 14  ALASKA        Delayed San.Franciso       102
## 15  AMWEST        On time San.Franciso       320
## 16  AMWEST        Delayed San.Franciso       129
## 17  ALASKA        On time      Seattle      1841
## 18  ALASKA        Delayed      Seattle       305
## 19  AMWEST        On time      Seattle       201
## 20  AMWEST        Delayed      Seattle        61

Normalize flight data set so that we can see for each airline and state, the frequencies of ontime and delays

on_time_ds<- select (
        filter(flight_freq, Arrival_Status=='On time' )
        ,Airline,State,On_time_freq=frequency
       )

delayed_ds <-select (
        filter(flight_freq, Arrival_Status=='Delayed' )
        ,Airline,State,Delayed_freq=frequency
       )
flight_ds <-inner_join(on_time_ds, delayed_ds, by = c("Airline","State"))
flight_ds
## # A tibble: 10 × 4
##    Airline        State On_time_freq Delayed_freq
##      <chr>        <chr>        <int>        <int>
## 1   ALASKA  Los.Angeles          497           62
## 2   AMWEST  Los.Angeles          694          117
## 3   ALASKA      Phoenix          221           12
## 4   AMWEST      Phoenix         4840          415
## 5   ALASKA    San.Diego          212           20
## 6   AMWEST    San.Diego          383           65
## 7   ALASKA San.Franciso          503          102
## 8   AMWEST San.Franciso          320          129
## 9   ALASKA      Seattle         1841          305
## 10  AMWEST      Seattle          201           61

Mutate Result set to calculate Ratio of delays per Airline, per state

Ratio_delay_ds <- mutate(flight_ds, Total_freq= On_time_freq + Delayed_freq, 
        Average_Delays = Delayed_freq/Total_freq)

Rearrange / sort dataset by the delay ratio to see Airline that performs worse

arrange(Ratio_delay_ds,State,Average_Delays)  
## # A tibble: 10 × 6
##    Airline        State On_time_freq Delayed_freq Total_freq
##      <chr>        <chr>        <int>        <int>      <int>
## 1   ALASKA  Los.Angeles          497           62        559
## 2   AMWEST  Los.Angeles          694          117        811
## 3   ALASKA      Phoenix          221           12        233
## 4   AMWEST      Phoenix         4840          415       5255
## 5   ALASKA    San.Diego          212           20        232
## 6   AMWEST    San.Diego          383           65        448
## 7   ALASKA San.Franciso          503          102        605
## 8   AMWEST San.Franciso          320          129        449
## 9   ALASKA      Seattle         1841          305       2146
## 10  AMWEST      Seattle          201           61        262
## # ... with 1 more variables: Average_Delays <dbl>

We start noticing that the AMWEST airline performs worse per state.

ggplot(Ratio_delay_ds,aes(x = State, y =Average_Delays))+
      geom_point(aes(color=Airline),               # colour depends on cond2
               size=3) 

Flight_graph <-mutate(Ratio_delay_ds, Airline_State= paste(Airline, State, sep ="_")) 



ggplot(Flight_graph, aes(x=State,y=Average_Delays)) + geom_boxplot(aes(color=Airline))

In summary, we can say that the ALASKA airline is more on time

Find total sum of average of the delay for both airline

Flight_summary <- Ratio_delay_ds %>% 
  group_by(Airline) %>% 
  summarise(Total_average_Delay = sum(Average_Delays))

Flight_summary
## # A tibble: 2 × 2
##   Airline Total_average_Delay
##     <chr>               <dbl>
## 1  ALASKA           0.5593413
## 2  AMWEST           0.8884576
ggplot(Flight_summary, aes(y=Total_average_Delay,x=Airline))  + geom_boxplot(aes(color= Airline),size =3)

Most delays comes from AMWEST airline

Just for curiosity, which city experienced the most delays?

City_summary <- Ratio_delay_ds %>% 
  group_by(State) %>% 
  summarise(Total_average_Delay = sum(Average_Delays))

arrange(City_summary,Total_average_Delay)
## # A tibble: 5 × 2
##          State Total_average_Delay
##          <chr>               <dbl>
## 1      Phoenix           0.1304746
## 2    San.Diego           0.2312962
## 3  Los.Angeles           0.2551787
## 4      Seattle           0.3749493
## 5 San.Franciso           0.4559002

Represent in a box plot

ggplot(City_summary, aes(y=Total_average_Delay,x=State))  + geom_boxplot(aes(color= State),size =3)

We can see it is San Franciso

03/09/2017

Simpson Paradox we can see that the trend reverses after we aggregate the result set to Airline level

 sp<- Ratio_delay_ds %>%
  group_by(Airline) %>%
  summarise(Total_ontime_freq =sum(On_time_freq), Total_Delay_freq = sum(Delayed_freq), Total_entire_freq = sum(Total_freq)) %>%
  mutate(Delay_total_avg = Total_Delay_freq/Total_entire_freq)
sp
## # A tibble: 2 × 5
##   Airline Total_ontime_freq Total_Delay_freq Total_entire_freq
##     <chr>             <int>            <int>             <int>
## 1  ALASKA              3274              501              3775
## 2  AMWEST              6438              787              7225
## # ... with 1 more variables: Delay_total_avg <dbl>

We can see that the AMWEST airline had a better performance overall while ALASKA had more delays overall.

ggplot(sp, aes(y=Airline,x=Delay_total_avg))  + geom_boxplot(aes(color= Airline),size =3)